MrSimple07 commited on
Commit
d1e7fd2
·
1 Parent(s): 9da507d

new documents prep

Browse files
Files changed (3) hide show
  1. documents_prep.py +501 -439
  2. table_prep.py +9 -7
  3. utils.py +0 -93
documents_prep.py CHANGED
@@ -3,486 +3,548 @@ import zipfile
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
6
- from my_logging import log_message
7
  from llama_index.core.text_splitter import SentenceSplitter
8
- from config import CHUNK_SIZE, CHUNK_OVERLAP
9
- from table_prep import table_to_document, load_table_data
10
 
 
 
 
11
 
12
- def chunk_document(doc, chunk_size=None, chunk_overlap=None):
13
- if chunk_size is None:
14
- chunk_size = CHUNK_SIZE
15
- if chunk_overlap is None:
16
- chunk_overlap = CHUNK_OVERLAP
17
  text_splitter = SentenceSplitter(
18
- chunk_size=chunk_size,
19
- chunk_overlap=chunk_overlap,
20
- separator=" "
21
  )
22
 
23
- text_chunks = text_splitter.split_text(doc.text)
24
-
25
- chunked_docs = []
26
- for i, chunk_text in enumerate(text_chunks):
27
- chunk_metadata = doc.metadata.copy()
28
- chunk_metadata.update({
29
- "chunk_id": i,
30
- "total_chunks": len(text_chunks),
31
- "chunk_size": len(chunk_text),
32
- "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
33
- })
34
-
35
- chunked_doc = Document(
36
- text=chunk_text,
37
- metadata=chunk_metadata
38
- )
39
- chunked_docs.append(chunked_doc)
 
40
 
41
- return chunked_docs
42
 
43
- def process_documents_with_chunking(documents):
44
- all_chunked_docs = []
45
- chunk_info = []
46
- table_count = 0
47
- table_chunks_count = 0
48
- image_count = 0
49
- image_chunks_count = 0
50
- text_chunks_count = 0
 
 
51
 
52
- for doc in documents:
53
- doc_type = doc.metadata.get('type', 'text')
54
- is_already_chunked = doc.metadata.get('is_chunked', False)
55
-
56
- if doc_type == 'table':
57
- if is_already_chunked:
58
- table_chunks_count += 1
59
- all_chunked_docs.append(doc)
60
- chunk_info.append({
61
- 'document_id': doc.metadata.get('document_id', 'unknown'),
62
- 'section_id': doc.metadata.get('section_id', 'unknown'),
63
- 'chunk_id': doc.metadata.get('chunk_id', 0),
64
- 'total_chunks': doc.metadata.get('total_chunks', 1),
65
- 'chunk_size': len(doc.text),
66
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
67
- 'type': 'table',
68
- 'table_number': doc.metadata.get('table_number', 'unknown')
69
- })
70
- else:
71
- table_count += 1
72
- all_chunked_docs.append(doc)
73
- chunk_info.append({
74
- 'document_id': doc.metadata.get('document_id', 'unknown'),
75
- 'section_id': doc.metadata.get('section_id', 'unknown'),
76
- 'chunk_id': 0,
77
- 'chunk_size': len(doc.text),
78
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
79
- 'type': 'table',
80
- 'table_number': doc.metadata.get('table_number', 'unknown')
81
- })
82
-
83
- elif doc_type == 'image':
84
- image_count += 1
85
- doc_size = len(doc.text)
86
- if doc_size > CHUNK_SIZE:
87
- log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
88
- f"Размер: {doc_size} > {CHUNK_SIZE}")
89
- chunked_docs = chunk_document(doc)
90
- image_chunks_count += len(chunked_docs)
91
- all_chunked_docs.extend(chunked_docs)
92
- log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")
93
-
94
- for i, chunk_doc in enumerate(chunked_docs):
95
- chunk_info.append({
96
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
97
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
98
- 'chunk_id': i,
99
- 'chunk_size': len(chunk_doc.text),
100
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
101
- 'type': 'image',
102
- 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
103
- })
104
- else:
105
- all_chunked_docs.append(doc)
106
- chunk_info.append({
107
- 'document_id': doc.metadata.get('document_id', 'unknown'),
108
- 'section_id': doc.metadata.get('section_id', 'unknown'),
109
- 'chunk_id': 0,
110
- 'chunk_size': doc_size,
111
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
112
- 'type': 'image',
113
- 'image_number': doc.metadata.get('image_number', 'unknown')
114
- })
115
-
116
  else:
117
- doc_size = len(doc.text)
118
- if doc_size > CHUNK_SIZE:
119
- log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
120
- f"Размер: {doc_size} > {CHUNK_SIZE}")
121
- chunked_docs = chunk_document(doc)
122
- text_chunks_count += len(chunked_docs)
123
- all_chunked_docs.extend(chunked_docs)
124
- log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")
125
-
126
- for i, chunk_doc in enumerate(chunked_docs):
127
- chunk_info.append({
128
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
129
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
130
- 'chunk_id': i,
131
- 'chunk_size': len(chunk_doc.text),
132
- 'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
133
- 'type': 'text'
134
- })
135
- else:
136
- all_chunked_docs.append(doc)
137
- chunk_info.append({
138
- 'document_id': doc.metadata.get('document_id', 'unknown'),
139
- 'section_id': doc.metadata.get('section_id', 'unknown'),
140
- 'chunk_id': 0,
141
- 'chunk_size': doc_size,
142
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
143
- 'type': 'text'
144
- })
145
-
146
- log_message(f"\n{'='*60}")
147
- log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
148
- log_message(f" • Таблицы (целые): {table_count}")
149
- log_message(f" • Таблицы (чанки): {table_chunks_count}")
150
- log_message(f" • Изображения (целые): {image_count - (image_chunks_count > 0)}")
151
- log_message(f" • Изображения (чанки): {image_chunks_count}")
152
- log_message(f" • Текстовые чанки: {text_chunks_count}")
153
- log_message(f" • Всего документов: {len(all_chunked_docs)}")
154
- log_message(f"{'='*60}\n")
155
-
156
- return all_chunked_docs, chunk_info
157
-
158
- def extract_text_from_json(data, document_id, document_name):
159
- documents = []
160
 
161
- if 'sections' in data:
162
- for section in data['sections']:
163
- section_id = section.get('section_id', 'Unknown')
164
- section_text = section.get('section_text', '')
165
-
166
- section_path = f"{section_id}"
167
- section_title = extract_section_title(section_text)
168
-
169
- if section_text.strip():
170
- doc = Document(
171
- text=section_text,
172
- metadata={
173
- "type": "text",
174
- "document_id": document_id,
175
- "document_name": document_name,
176
- "section_id": section_id,
177
- "section_text": section_title[:200],
178
- "section_path": section_path,
179
- "level": "section"
180
- }
181
- )
182
- documents.append(doc)
183
-
184
- if 'subsections' in section:
185
- for subsection in section['subsections']:
186
- subsection_id = subsection.get('subsection_id', 'Unknown')
187
- subsection_text = subsection.get('subsection_text', '')
188
- subsection_title = extract_section_title(subsection_text)
189
- subsection_path = f"{section_path}.{subsection_id}"
190
-
191
- if subsection_text.strip():
192
- doc = Document(
193
- text=subsection_text,
194
- metadata={
195
- "type": "text",
196
- "document_id": document_id,
197
- "document_name": document_name,
198
- "section_id": subsection_id,
199
- "section_text": subsection_title[:200],
200
- "section_path": subsection_path,
201
- "level": "subsection",
202
- "parent_section": section_id,
203
- "parent_title": section_title[:100]
204
- }
205
- )
206
- documents.append(doc)
207
-
208
- if 'sub_subsections' in subsection:
209
- for sub_subsection in subsection['sub_subsections']:
210
- sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
211
- sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
212
- sub_subsection_title = extract_section_title(sub_subsection_text)
213
- sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
214
-
215
- if sub_subsection_text.strip():
216
- doc = Document(
217
- text=sub_subsection_text,
218
- metadata={
219
- "type": "text",
220
- "document_id": document_id,
221
- "document_name": document_name,
222
- "section_id": sub_subsection_id,
223
- "section_text": sub_subsection_title[:200],
224
- "section_path": sub_subsection_path,
225
- "level": "sub_subsection",
226
- "parent_section": subsection_id,
227
- "parent_title": subsection_title[:100]
228
- }
229
- )
230
- documents.append(doc)
231
-
232
- if 'sub_sub_subsections' in sub_subsection:
233
- for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
234
- sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
235
- sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
236
- sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
237
-
238
- if sub_sub_subsection_text.strip():
239
- doc = Document(
240
- text=sub_sub_subsection_text,
241
- metadata={
242
- "type": "text",
243
- "document_id": document_id,
244
- "document_name": document_name,
245
- "section_id": sub_sub_subsection_id,
246
- "section_text": sub_sub_subsection_title[:200],
247
- "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
248
- "level": "sub_sub_subsection",
249
- "parent_section": sub_subsection_id,
250
- "parent_title": sub_subsection_title[:100]
251
- }
252
- )
253
- documents.append(doc)
254
 
255
- return documents
256
-
257
- def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
258
- log_message("Начинаю загрузку JSON документов")
259
 
260
- try:
261
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
262
- zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
263
- json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
264
-
265
- log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
 
 
 
266
 
267
- all_documents = []
 
 
 
 
 
 
 
 
 
 
268
 
269
- for zip_file_path in zip_files:
270
- try:
271
- log_message(f"Загружаю ZIP архив: {zip_file_path}")
272
- local_zip_path = hf_hub_download(
273
- repo_id=repo_id,
274
- filename=zip_file_path,
275
- local_dir=download_dir,
276
- repo_type="dataset",
277
- token=hf_token
278
- )
279
-
280
- documents = extract_zip_and_process_json(local_zip_path)
281
- all_documents.extend(documents)
282
- log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
283
-
284
- except Exception as e:
285
- log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
286
- continue
287
-
288
- for file_path in json_files:
289
- try:
290
- log_message(f"Обрабатываю прямой JSON файл: {file_path}")
291
- local_path = hf_hub_download(
292
- repo_id=repo_id,
293
- filename=file_path,
294
- local_dir=download_dir,
295
- repo_type="dataset",
296
- token=hf_token
297
- )
298
-
299
- with open(local_path, 'r', encoding='utf-8') as f:
300
- json_data = json.load(f)
301
-
302
- document_metadata = json_data.get('document_metadata', {})
303
- document_id = document_metadata.get('document_id', 'unknown')
304
- document_name = document_metadata.get('document_name', 'unknown')
305
-
306
- documents = extract_text_from_json(json_data, document_id, document_name)
307
- all_documents.extend(documents)
308
-
309
- log_message(f"Извлечено {len(documents)} документов из {file_path}")
310
-
311
- except Exception as e:
312
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
313
- continue
314
-
315
- log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
316
 
317
- # Process documents through chunking function
318
- chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
- log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
 
 
 
 
 
 
 
 
 
 
321
 
322
- return chunked_documents, chunk_info
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
- except Exception as e:
325
- log_message(f"Ошибка загрузки JSON документов: {str(e)}")
326
- return [], []
 
327
 
328
- def extract_section_title(section_text):
329
- if not section_text.strip():
330
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
- lines = section_text.strip().split('\n')
333
- first_line = lines[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
- if len(first_line) < 200 and not first_line.endswith('.'):
336
- return first_line
337
 
338
- # Otherwise, extract first sentence
339
- sentences = first_line.split('.')
340
- if len(sentences) > 1:
341
- return sentences[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
- return first_line[:100] + "..." if len(first_line) > 100 else first_line
 
 
344
 
345
- def extract_zip_and_process_json(zip_path):
 
 
 
 
 
 
 
 
 
 
 
 
346
  documents = []
 
347
 
348
- try:
349
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
350
- zip_files = zip_ref.namelist()
351
- json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
 
 
 
 
 
352
 
353
- log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
 
 
 
 
 
 
 
354
 
355
- for json_file in json_files:
356
- try:
357
- log_message(f"Обрабатываю файл из архива: {json_file}")
358
-
359
- with zip_ref.open(json_file) as f:
360
- json_data = json.load(f)
361
-
362
- document_metadata = json_data.get('document_metadata', {})
363
- document_id = document_metadata.get('document_id', 'unknown')
364
- document_name = document_metadata.get('document_name', 'unknown')
365
-
366
- docs = extract_text_from_json(json_data, document_id, document_name)
367
- documents.extend(docs)
368
-
369
- log_message(f"Извлечено {len(docs)} документов из {json_file}")
370
-
371
- except Exception as e:
372
- log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
373
- continue
374
 
375
- except Exception as e:
376
- log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
  return documents
379
 
380
- def load_image_data(repo_id, hf_token, image_data_dir):
381
- log_message("Начинаю загрузку данных изображений")
 
382
 
383
- image_files = []
384
  try:
385
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
386
- for file in files:
387
- if file.startswith(image_data_dir) and file.endswith('.csv'):
388
- image_files.append(file)
389
 
390
- log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
391
 
392
- image_documents = []
393
- for file_path in image_files:
394
- try:
395
- log_message(f"Обрабатываю файл изображений: {file_path}")
396
- local_path = hf_hub_download(
397
- repo_id=repo_id,
398
- filename=file_path,
399
- local_dir='',
400
- repo_type="dataset",
401
- token=hf_token
402
- )
403
-
404
- df = pd.read_csv(local_path)
405
- log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
406
-
407
- # Обработка с правильными названиями колонок
408
- for _, row in df.iterrows():
409
- section_value = row.get('Раздел документа', 'Неизвестно')
410
-
411
- content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
412
- content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
413
- content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n" # Опечатка в названии колонки
414
- content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
415
- content += f"Раздел: {section_value}\n"
416
- content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
417
-
418
- doc = Document(
419
- text=content,
420
  metadata={
421
- "type": "image",
422
- "image_number": str(row.get('№ Изображения', 'unknown')),
423
- "image_title": str(row.get('Название изображения', 'unknown')),
424
- "image_description": str(row.get('Описание изображение', 'unknown')),
425
- "document_id": str(row.get('Обозначение документа', 'unknown')),
426
- "file_path": str(row.get('Файл изображения', 'unknown')),
427
- "section": str(section_value),
428
- "section_id": str(section_value)
429
  }
430
- )
431
- image_documents.append(doc)
432
-
433
- except Exception as e:
434
- log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
435
- continue
436
-
437
- log_message(f"Создано {len(image_documents)} документов из изображений")
438
- return image_documents
439
-
 
 
 
 
440
  except Exception as e:
441
- log_message(f"Ошибка загрузки данных изображений: {str(e)}")
442
- return []
 
443
 
444
 
445
- def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
446
- log_message("Загружаю данные чанков из CSV")
 
447
 
448
- try:
449
- chunks_csv_path = hf_hub_download(
450
- repo_id=repo_id,
451
- filename=chunks_filename,
452
- local_dir=download_dir,
453
- repo_type="dataset",
454
- token=hf_token
455
- )
456
-
457
- chunks_df = pd.read_csv(chunks_csv_path)
458
- log_message(f"Загружено {len(chunks_df)} чанков из CSV")
459
-
460
- text_column = None
461
- for col in chunks_df.columns:
462
- if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
463
- text_column = col
464
- break
465
-
466
- if text_column is None:
467
- text_column = chunks_df.columns[0]
468
-
469
- log_message(f"Использую колонку: {text_column}")
470
-
471
- documents = []
472
- for i, (_, row) in enumerate(chunks_df.iterrows()):
473
- doc = Document(
474
- text=str(row[text_column]),
475
- metadata={
476
- "chunk_id": row.get('chunk_id', i),
477
- "document_id": row.get('document_id', 'unknown'),
478
- "type": "text"
479
- }
480
  )
481
- documents.append(doc)
482
-
483
- log_message(f"Создано {len(documents)} текстовых документов из CSV")
484
- return documents, chunks_df
485
-
486
- except Exception as e:
487
- log_message(f"Ошибка загрузки CSV данных: {str(e)}")
488
- return [], None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
 
6
  from llama_index.core.text_splitter import SentenceSplitter
7
+ from my_logging import log_message
 
8
 
9
+ # Configuration
10
+ CHUNK_SIZE = 512
11
+ CHUNK_OVERLAP = 128
12
 
13
+ def chunk_text_documents(documents):
 
 
 
 
14
  text_splitter = SentenceSplitter(
15
+ chunk_size=CHUNK_SIZE,
16
+ chunk_overlap=CHUNK_OVERLAP
 
17
  )
18
 
19
+ chunked = []
20
+ for doc in documents:
21
+ chunks = text_splitter.get_nodes_from_documents([doc])
22
+ for i, chunk in enumerate(chunks):
23
+ chunk.metadata.update({
24
+ 'chunk_id': i,
25
+ 'total_chunks': len(chunks),
26
+ 'chunk_size': len(chunk.text) # Add chunk size
27
+ })
28
+ chunked.append(chunk)
29
+
30
+ # Log statistics
31
+ if chunked:
32
+ avg_size = sum(len(c.text) for c in chunked) / len(chunked)
33
+ min_size = min(len(c.text) for c in chunked)
34
+ max_size = max(len(c.text) for c in chunked)
35
+ log_message(f"✓ Text: {len(documents)} docs → {len(chunked)} chunks")
36
+ log_message(f" Size stats: avg={avg_size:.0f}, min={min_size}, max={max_size} chars")
37
 
38
+ return chunked
39
 
40
+
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=1200):
42
+ """Chunk tables by content size instead of rows"""
43
+ headers = table_data.get('headers', [])
44
+ rows = table_data.get('data', [])
45
+ table_num = table_data.get('table_number', 'unknown')
46
+ table_title = table_data.get('table_title', '')
47
+ section = table_data.get('section', '')
48
+
49
+ table_num_clean = str(table_num).strip()
50
 
51
+ # Create section-aware identifier
52
+ import re
53
+ if 'приложени' in section.lower():
54
+ appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
55
+ if appendix_match:
56
+ appendix_num = appendix_match.group(1).upper()
57
+ table_identifier = f"{table_num_clean} Приложение {appendix_num}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  else:
59
+ table_identifier = table_num_clean
60
+ else:
61
+ table_identifier = table_num_clean
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ if not rows:
64
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
 
 
 
67
 
68
+ # Calculate base metadata size (everything except row data)
69
+ base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
70
+ base_size = len(base_content)
71
+ available_space = max_chars - base_size - 200
72
+
73
+ # If entire table fits, return as one chunk
74
+ full_rows_content = format_table_rows(rows)
75
+ if base_size + len(full_rows_content) <= max_chars:
76
+ content = base_content + full_rows_content + format_table_footer(table_identifier, doc_id)
77
 
78
+ metadata = {
79
+ 'type': 'table',
80
+ 'document_id': doc_id,
81
+ 'table_number': table_num_clean,
82
+ 'table_identifier': table_identifier,
83
+ 'table_title': table_title,
84
+ 'section': section,
85
+ 'total_rows': len(rows),
86
+ 'chunk_size': len(content),
87
+ 'is_complete_table': True
88
+ }
89
 
90
+ log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
91
+ return [Document(text=content, metadata=metadata)]
92
+
93
+ # Otherwise, chunk by content size
94
+ chunks = []
95
+ current_rows = []
96
+ current_size = 0
97
+ chunk_num = 0
98
+
99
+ for i, row in enumerate(rows):
100
+ row_text = format_single_row(row, i + 1)
101
+ row_size = len(row_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ # If adding this row exceeds limit, save current chunk
104
+ if current_size + row_size > available_space and current_rows:
105
+ content = base_content + format_table_rows(current_rows)
106
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
107
+ content += format_table_footer(table_identifier, doc_id)
108
+
109
+ metadata = {
110
+ 'type': 'table',
111
+ 'document_id': doc_id,
112
+ 'table_number': table_num_clean,
113
+ 'table_identifier': table_identifier,
114
+ 'table_title': table_title,
115
+ 'section': section,
116
+ 'chunk_id': chunk_num,
117
+ 'row_start': current_rows[0]['_idx'] - 1,
118
+ 'row_end': current_rows[-1]['_idx'],
119
+ 'total_rows': len(rows),
120
+ 'chunk_size': len(content),
121
+ 'is_complete_table': False
122
+ }
123
+
124
+ chunks.append(Document(text=content, metadata=metadata))
125
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
126
+
127
+ chunk_num += 1
128
+ current_rows = []
129
+ current_size = 0
130
 
131
+ # Add row index for tracking
132
+ row_copy = row.copy() if isinstance(row, dict) else {'data': row}
133
+ row_copy['_idx'] = i + 1
134
+ current_rows.append(row_copy)
135
+ current_size += row_size
136
+
137
+ # Add final chunk if rows remain
138
+ if current_rows:
139
+ content = base_content + format_table_rows(current_rows)
140
+ content += f"\n\nСтроки {current_rows[0]['_idx']}-{current_rows[-1]['_idx']} из {len(rows)}\n"
141
+ content += format_table_footer(table_identifier, doc_id)
142
 
143
+ metadata = {
144
+ 'type': 'table',
145
+ 'document_id': doc_id,
146
+ 'table_number': table_num_clean,
147
+ 'table_identifier': table_identifier,
148
+ 'table_title': table_title,
149
+ 'section': section,
150
+ 'chunk_id': chunk_num,
151
+ 'row_start': current_rows[0]['_idx'] - 1,
152
+ 'row_end': current_rows[-1]['_idx'],
153
+ 'total_rows': len(rows),
154
+ 'chunk_size': len(content),
155
+ 'is_complete_table': False
156
+ }
157
 
158
+ chunks.append(Document(text=content, metadata=metadata))
159
+ log_message(f" Chunk {chunk_num + 1}: {len(content)} chars, {len(current_rows)} rows")
160
+
161
+ return chunks
162
 
163
+
164
+ def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
165
+ """Format consistent table header"""
166
+ content = f"ДОКУМЕНТ: {doc_id}\n"
167
+ content += f"ТАБЛИЦА: {table_identifier}\n"
168
+ content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
169
+ content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
170
+ if table_title:
171
+ content += f"НАЗВАНИЕ: {table_title}\n"
172
+ if section:
173
+ content += f"РАЗДЕЛ: {section}\n"
174
+ content += f"{'='*70}\n\n"
175
+
176
+ # Enhanced search keywords
177
+ content += f"Это таблица {table_identifier} из документа {doc_id}. "
178
+ content += f"Идентификатор: {table_identifier}. Номер: {table_num}. Документ: {doc_id}. "
179
+
180
+ if section:
181
+ content += f"Раздел: {section}. "
182
+ if 'приложени' in section.lower():
183
+ content += f"Таблица из приложения. "
184
+
185
+ if table_title:
186
+ content += f"Название: {table_title}. "
187
+
188
+ content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n{'='*70}\n\n"
189
+
190
+ if headers:
191
+ header_str = ' | '.join(str(h) for h in headers)
192
+ content += f"ЗАГОЛОВКИ: {header_str}\n\n"
193
 
194
+ content += "ДАННЫЕ:\n"
195
+ return content
196
+
197
+
198
+ def format_single_row(row, idx):
199
+ """Format a single row"""
200
+ if isinstance(row, dict):
201
+ parts = [f"{k}: {v}" for k, v in row.items()
202
+ if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
203
+ if parts:
204
+ return f"{idx}. {' | '.join(parts)}\n"
205
+ elif isinstance(row, list):
206
+ parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
207
+ if parts:
208
+ return f"{idx}. {' | '.join(parts)}\n"
209
+ return ""
210
+
211
+
212
+ def format_table_rows(rows):
213
+ """Format multiple rows"""
214
+ content = ""
215
+ for row in rows:
216
+ idx = row.get('_idx', 0)
217
+ content += format_single_row(row, idx)
218
+ return content
219
+
220
+
221
+ def format_table_footer(table_identifier, doc_id):
222
+ """Format table footer"""
223
+ return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
224
+
225
+ def load_table_documents(repo_id, hf_token, table_dir):
226
+ log_message("Loading tables...")
227
 
228
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
229
+ table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
230
 
231
+ all_chunks = []
232
+ for file_path in table_files:
233
+ try:
234
+ local_path = hf_hub_download(
235
+ repo_id=repo_id,
236
+ filename=file_path,
237
+ repo_type="dataset",
238
+ token=hf_token
239
+ )
240
+
241
+ with open(local_path, 'r', encoding='utf-8') as f:
242
+ data = json.load(f)
243
+
244
+ file_doc_id = data.get('document_id', data.get('document', 'unknown'))
245
+
246
+ for sheet in data.get('sheets', []):
247
+ sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
248
+
249
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
250
+ all_chunks.extend(chunks)
251
+
252
+ except Exception as e:
253
+ log_message(f"Error loading {file_path}: {e}")
254
 
255
+ log_message(f" Loaded {len(all_chunks)} table chunks")
256
+ return all_chunks
257
+
258
 
259
+ def load_json_documents(repo_id, hf_token, json_dir):
260
+ import zipfile
261
+ import tempfile
262
+ import os
263
+
264
+ log_message("Loading JSON documents...")
265
+
266
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
267
+ json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
268
+ zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
269
+
270
+ log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
271
+
272
  documents = []
273
+ stats = {'success': 0, 'failed': 0, 'empty': 0}
274
 
275
+ for file_path in json_files:
276
+ try:
277
+ log_message(f" Loading: {file_path}")
278
+ local_path = hf_hub_download(
279
+ repo_id=repo_id,
280
+ filename=file_path,
281
+ repo_type="dataset",
282
+ token=hf_token
283
+ )
284
 
285
+ docs = extract_sections_from_json(local_path)
286
+ if docs:
287
+ documents.extend(docs)
288
+ stats['success'] += 1
289
+ log_message(f" ✓ Extracted {len(docs)} sections")
290
+ else:
291
+ stats['empty'] += 1
292
+ log_message(f" ⚠ No sections found")
293
 
294
+ except Exception as e:
295
+ stats['failed'] += 1
296
+ log_message(f" Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
+ for zip_path in zip_files:
299
+ try:
300
+ log_message(f" Processing ZIP: {zip_path}")
301
+ local_zip = hf_hub_download(
302
+ repo_id=repo_id,
303
+ filename=zip_path,
304
+ repo_type="dataset",
305
+ token=hf_token
306
+ )
307
+
308
+ with zipfile.ZipFile(local_zip, 'r') as zf:
309
+ json_files_in_zip = [f for f in zf.namelist()
310
+ if f.endswith('.json')
311
+ and not f.startswith('__MACOSX')
312
+ and not f.startswith('.')
313
+ and not '._' in f]
314
+
315
+ log_message(f" Found {len(json_files_in_zip)} JSON files in ZIP")
316
+
317
+ for json_file in json_files_in_zip:
318
+ try:
319
+ file_content = zf.read(json_file)
320
+
321
+ # Skip if file is too small
322
+ if len(file_content) < 10:
323
+ log_message(f" ✗ Skipping: {json_file} (file too small)")
324
+ stats['failed'] += 1
325
+ continue
326
+
327
+ # Try UTF-8 first (most common)
328
+ try:
329
+ text_content = file_content.decode('utf-8')
330
+ except UnicodeDecodeError:
331
+ try:
332
+ text_content = file_content.decode('utf-8-sig')
333
+ except UnicodeDecodeError:
334
+ try:
335
+ # Try UTF-16 (the issue you're seeing)
336
+ text_content = file_content.decode('utf-16')
337
+ except UnicodeDecodeError:
338
+ try:
339
+ text_content = file_content.decode('windows-1251')
340
+ except UnicodeDecodeError:
341
+ log_message(f" ✗ Skipping: {json_file} (encoding failed)")
342
+ stats['failed'] += 1
343
+ continue
344
+
345
+ # Validate JSON structure
346
+ if not text_content.strip().startswith('{') and not text_content.strip().startswith('['):
347
+ log_message(f" ✗ Skipping: {json_file} (not valid JSON)")
348
+ stats['failed'] += 1
349
+ continue
350
+
351
+ with tempfile.NamedTemporaryFile(mode='w', delete=False,
352
+ suffix='.json', encoding='utf-8') as tmp:
353
+ tmp.write(text_content)
354
+ tmp_path = tmp.name
355
+
356
+ docs = extract_sections_from_json(tmp_path)
357
+ if docs:
358
+ documents.extend(docs)
359
+ stats['success'] += 1
360
+ log_message(f" ✓ {json_file}: {len(docs)} sections")
361
+ else:
362
+ stats['empty'] += 1
363
+ log_message(f" ⚠ {json_file}: No sections")
364
+
365
+ os.unlink(tmp_path)
366
+
367
+ except json.JSONDecodeError as e:
368
+ stats['failed'] += 1
369
+ log_message(f" ✗ {json_file}: Invalid JSON")
370
+ except Exception as e:
371
+ stats['failed'] += 1
372
+ log_message(f" ✗ {json_file}: {str(e)[:100]}")
373
+
374
+ except Exception as e:
375
+ log_message(f" ✗ Error with ZIP: {e}")
376
+
377
+ log_message(f"="*60)
378
+ log_message(f"JSON Loading Stats:")
379
+ log_message(f" Success: {stats['success']}")
380
+ log_message(f" Empty: {stats['empty']}")
381
+ log_message(f" Failed: {stats['failed']}")
382
+ log_message(f" Total sections: {len(documents)}")
383
+ log_message(f"="*60)
384
 
385
  return documents
386
 
387
+ def extract_sections_from_json(json_path):
388
+ """Extract sections from a single JSON file"""
389
+ documents = []
390
 
 
391
  try:
392
+ with open(json_path, 'r', encoding='utf-8') as f:
393
+ data = json.load(f)
 
 
394
 
395
+ doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
396
 
397
+ # Extract all section levels
398
+ for section in data.get('sections', []):
399
+ if section.get('section_text', '').strip():
400
+ documents.append(Document(
401
+ text=section['section_text'],
402
+ metadata={
403
+ 'type': 'text',
404
+ 'document_id': doc_id,
405
+ 'section_id': section.get('section_id', '')
406
+ }
407
+ ))
408
+
409
+ # Subsections
410
+ for subsection in section.get('subsections', []):
411
+ if subsection.get('subsection_text', '').strip():
412
+ documents.append(Document(
413
+ text=subsection['subsection_text'],
 
 
 
 
 
 
 
 
 
 
 
414
  metadata={
415
+ 'type': 'text',
416
+ 'document_id': doc_id,
417
+ 'section_id': subsection.get('subsection_id', '')
 
 
 
 
 
418
  }
419
+ ))
420
+
421
+ # Sub-subsections
422
+ for sub_sub in subsection.get('sub_subsections', []):
423
+ if sub_sub.get('sub_subsection_text', '').strip():
424
+ documents.append(Document(
425
+ text=sub_sub['sub_subsection_text'],
426
+ metadata={
427
+ 'type': 'text',
428
+ 'document_id': doc_id,
429
+ 'section_id': sub_sub.get('sub_subsection_id', '')
430
+ }
431
+ ))
432
+
433
  except Exception as e:
434
+ log_message(f"Error extracting from {json_path}: {e}")
435
+
436
+ return documents
437
 
438
 
439
+ def load_table_documents(repo_id, hf_token, table_dir):
440
+ """Load and chunk tables"""
441
+ log_message("Loading tables...")
442
 
443
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
444
+ table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
445
+
446
+ all_chunks = []
447
+ for file_path in table_files:
448
+ try:
449
+ local_path = hf_hub_download(
450
+ repo_id=repo_id,
451
+ filename=file_path,
452
+ repo_type="dataset",
453
+ token=hf_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  )
455
+
456
+ with open(local_path, 'r', encoding='utf-8') as f:
457
+ data = json.load(f)
458
+
459
+ # Extract file-level document_id
460
+ file_doc_id = data.get('document_id', data.get('document', 'unknown'))
461
+
462
+ for sheet in data.get('sheets', []):
463
+ # Use sheet-level document_id if available, otherwise use file-level
464
+ sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
465
+
466
+ # CRITICAL: Pass document_id to chunk function
467
+ chunks = chunk_table_by_content(sheet, sheet_doc_id)
468
+ all_chunks.extend(chunks)
469
+
470
+ except Exception as e:
471
+ log_message(f"Error loading {file_path}: {e}")
472
+
473
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks")
474
+ return all_chunks
475
+
476
+
477
+ def load_image_documents(repo_id, hf_token, image_dir):
478
+ """Load image descriptions"""
479
+ log_message("Loading images...")
480
+
481
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
482
+ csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]
483
+
484
+ documents = []
485
+ for file_path in csv_files:
486
+ try:
487
+ local_path = hf_hub_download(
488
+ repo_id=repo_id,
489
+ filename=file_path,
490
+ repo_type="dataset",
491
+ token=hf_token
492
+ )
493
+
494
+ df = pd.read_csv(local_path)
495
+
496
+ for _, row in df.iterrows():
497
+ content = f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
498
+ content += f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
499
+ content += f"Название: {row.get('Название изображения', '')}\n"
500
+ content += f"Описание: {row.get('Описание изображение', '')}\n"
501
+ content += f"Раздел: {row.get('Раздел документа', '')}\n"
502
+
503
+ chunk_size = len(content)
504
+
505
+ documents.append(Document(
506
+ text=content,
507
+ metadata={
508
+ 'type': 'image',
509
+ 'document_id': str(row.get('Обозначение документа', 'unknown')),
510
+ 'image_number': str(row.get('№ Изображения', 'unknown')),
511
+ 'section': str(row.get('Раздел документа', '')),
512
+ 'chunk_size': chunk_size
513
+ }
514
+ ))
515
+ except Exception as e:
516
+ log_message(f"Error loading {file_path}: {e}")
517
+
518
+ if documents:
519
+ avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
520
+ log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
521
+
522
+ return documents
523
+
524
+
525
+ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
526
+ """Main loader - combines all document types"""
527
+ log_message("="*60)
528
+ log_message("STARTING DOCUMENT LOADING")
529
+ log_message("="*60)
530
+
531
+ # Load text sections
532
+ text_docs = load_json_documents(repo_id, hf_token, json_dir)
533
+ text_chunks = chunk_text_documents(text_docs)
534
+
535
+ # Load tables (already chunked)
536
+ table_chunks = load_table_documents(repo_id, hf_token, table_dir)
537
+
538
+ # Load images (no chunking needed)
539
+ image_docs = load_image_documents(repo_id, hf_token, image_dir)
540
+
541
+ all_docs = text_chunks + table_chunks + image_docs
542
+
543
+ log_message("="*60)
544
+ log_message(f"TOTAL DOCUMENTS: {len(all_docs)}")
545
+ log_message(f" Text chunks: {len(text_chunks)}")
546
+ log_message(f" Table chunks: {len(table_chunks)}")
547
+ log_message(f" Images: {len(image_docs)}")
548
+ log_message("="*60)
549
+
550
+ return all_docs
table_prep.py CHANGED
@@ -32,8 +32,7 @@ def create_table_content(table_data):
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
- def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
36
- """Simple table chunking: max 5 rows or 2000 chars per chunk"""
37
 
38
  lines = doc.text.strip().split('\n')
39
 
@@ -63,18 +62,17 @@ def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
63
  current_size = len(header)
64
 
65
  for row in data_rows:
66
- row_size = len(row) + 1 # +1 for newline
67
-
68
- # Check if we need to create a new chunk
69
  if (len(current_rows) >= max_rows_per_chunk or
70
  current_size + row_size > max_chunk_size) and current_rows:
71
 
72
  # Save current chunk
73
  chunk_text = header + '\n'.join(current_rows)
74
  chunks.append(chunk_text)
75
-
76
- # Start new chunk (keep last row for overlap)
77
  current_rows = [current_rows[-1]]
 
78
  current_size = len(header) + len(current_rows[0]) + 1
79
 
80
  current_rows.append(row)
@@ -147,7 +145,10 @@ def table_to_document(table_data, document_id=None):
147
  }
148
  )
149
  if len(content) > 2000:
 
 
150
  return chunk_table_document(base_doc)
 
151
 
152
  return [base_doc]
153
 
@@ -205,6 +206,7 @@ def load_table_data(repo_id, hf_token, table_data_dir):
205
  stats['total_size'] += size
206
  stats['by_document'][document_id]['count'] += 1
207
  stats['by_document'][document_id]['size'] += size
 
208
  else:
209
  docs_list = table_to_document(table_data, document_id)
210
  table_documents.extend(docs_list)
 
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
+ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
 
36
 
37
  lines = doc.text.strip().split('\n')
38
 
 
62
  current_size = len(header)
63
 
64
  for row in data_rows:
65
+ row_size = len(row) + 1
 
 
66
  if (len(current_rows) >= max_rows_per_chunk or
67
  current_size + row_size > max_chunk_size) and current_rows:
68
 
69
  # Save current chunk
70
  chunk_text = header + '\n'.join(current_rows)
71
  chunks.append(chunk_text)
72
+ log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
73
+
74
  current_rows = [current_rows[-1]]
75
+ log_message(f"Перенос строки для перекрытия: {current_rows[-1]}")
76
  current_size = len(header) + len(current_rows[0]) + 1
77
 
78
  current_rows.append(row)
 
145
  }
146
  )
147
  if len(content) > 2000:
148
+ chunks = chunk_table_document(base_doc)
149
+ log_message(f"Таблица {table_num} разбита на {len(chunks)} частей")
150
  return chunk_table_document(base_doc)
151
+
152
 
153
  return [base_doc]
154
 
 
206
  stats['total_size'] += size
207
  stats['by_document'][document_id]['count'] += 1
208
  stats['by_document'][document_id]['size'] += size
209
+ log_message(f"Добавлена таблица {sheet.get('table_number', 'Неизвестно')} из документа {document_id}, размер {size} символов")
210
  else:
211
  docs_list = table_to_document(table_data, document_id)
212
  table_documents.extend(docs_list)
utils.py CHANGED
@@ -43,99 +43,6 @@ def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingua
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
46
- def format_context_for_llm(nodes):
47
- context_parts = []
48
-
49
- for node in nodes:
50
- metadata = node.metadata if hasattr(node, 'metadata') else {}
51
- doc_id = metadata.get('document_id', 'Неизвестный документ')
52
-
53
- section_info = ""
54
-
55
- # Handle section information with proper hierarchy
56
- if metadata.get('section_path'):
57
- section_path = metadata['section_path']
58
- section_text = metadata.get('section_text', '')
59
- parent_section = metadata.get('parent_section', '')
60
- parent_title = metadata.get('parent_title', '')
61
- level = metadata.get('level', '')
62
-
63
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
64
- # For subsections: раздел X (Title), пункт X.X
65
- if section_text:
66
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path} ({section_text})"
67
- else:
68
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_path}"
69
- elif section_text:
70
- # For main sections: раздел X (Title)
71
- section_info = f"раздел {section_path} ({section_text})"
72
- else:
73
- section_info = f"раздел {section_path}"
74
-
75
- elif metadata.get('section_id'):
76
- section_id = metadata['section_id']
77
- section_text = metadata.get('section_text', '')
78
- level = metadata.get('level', '')
79
- parent_section = metadata.get('parent_section', '')
80
- parent_title = metadata.get('parent_title', '')
81
-
82
- if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
83
- if section_text:
84
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id} ({section_text})"
85
- else:
86
- section_info = f"раздел {parent_section} ({parent_title}), пункт {section_id}"
87
- elif section_text:
88
- section_info = f"раздел {section_id} ({section_text})"
89
- else:
90
- section_info = f"раздел {section_id}"
91
-
92
- # Override with table/image info if applicable
93
- if metadata.get('type') == 'table' and metadata.get('table_number'):
94
- table_num = metadata['table_number']
95
- if not str(table_num).startswith('№'):
96
- table_num = f"№{table_num}"
97
- table_title = metadata.get('table_title', '')
98
- # Include section context for tables
99
- base_section = ""
100
- if metadata.get('section_path'):
101
- base_section = f", раздел {metadata['section_path']}"
102
- elif metadata.get('section_id'):
103
- base_section = f", раздел {metadata['section_id']}"
104
-
105
- if table_title:
106
- section_info = f"Таблица {table_num} ({table_title}){base_section}"
107
- else:
108
- section_info = f"Таблица {table_num}{base_section}"
109
-
110
- if metadata.get('type') == 'image' and metadata.get('image_number'):
111
- image_num = metadata['image_number']
112
- if not str(image_num).startswith('№'):
113
- image_num = f"№{image_num}"
114
- image_title = metadata.get('image_title', '')
115
- # Include section context for images
116
- base_section = ""
117
- if metadata.get('section_path'):
118
- base_section = f", раздел {metadata['section_path']}"
119
- elif metadata.get('section_id'):
120
- base_section = f", раздел {metadata['section_id']}"
121
-
122
- if image_title:
123
- section_info = f"Рисунок {image_num} ({image_title}){base_section}"
124
- else:
125
- section_info = f"Рисунок {image_num}{base_section}"
126
-
127
- context_text = node.text if hasattr(node, 'text') else str(node)
128
-
129
- if section_info:
130
- formatted_context = f"[ИСТОЧНИК: {section_info}, документ {doc_id}]\n{context_text}\n"
131
- else:
132
- formatted_context = f"[ИСТОЧНИК: документ {doc_id}]\n{context_text}\n"
133
-
134
- context_parts.append(formatted_context)
135
-
136
- return "\n".join(context_parts)
137
-
138
-
139
  def generate_sources_html(nodes, chunks_df=None):
140
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
141
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"
 
43
  def get_reranker_model(model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'):
44
  return CrossEncoder(model_name)
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def generate_sources_html(nodes, chunks_df=None):
47
  html = "<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; max-height: 400px; overflow-y: auto;'>"
48
  html += "<h3 style='color: #63b3ed; margin-top: 0;'>Источники:</h3>"