from collections import Counter, defaultdict
import json

from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document
from llama_index.core.text_splitter import SentenceSplitter

from config import CHUNK_SIZE, CHUNK_OVERLAP
from my_logging import log_message


def create_table_content(table_data):
    """Create formatted text content from table data."""
    doc_id = table_data.get('document_id', table_data.get('document', 'Unknown'))
    table_num = table_data.get('table_number', 'Unknown')
    table_title = table_data.get('table_title', 'Unknown')
    section = table_data.get('section', 'Unknown')
    content = f"Table: {table_num}\n"
    content += f"Title: {table_title}\n"
    content += f"Document: {doc_id}\n"
    content += f"Section: {section}\n"
    headers = table_data.get('headers', [])
    if headers:
        content += f"\nHeaders: {' | '.join(headers)}\n"
    if 'data' in table_data and isinstance(table_data['data'], list):
        content += "\nTable data:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):
            if isinstance(row, dict):
                row_text = " | ".join(f"{k}: {v}" for k, v in row.items() if v)
                content += f"Row {row_idx}: {row_text}\n"
    return content
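

# Illustration with hypothetical data: for the input
#   {"document": "doc-1", "table_number": "3.1", "table_title": "Loads",
#    "headers": ["Material", "Value"],
#    "data": [{"Material": "Steel", "Value": "10"}]}
# create_table_content returns:
#   Table: 3.1
#   Title: Loads
#   Document: doc-1
#   Section: Unknown
#
#   Headers: Material | Value
#
#   Table data:
#   Row 1: Material: Steel | Value: 10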


def extract_table_metadata(table_text: str) -> dict:
    """Derive a short summary and the most frequent key terms from table text."""
    words = table_text.split()
    unique_words = set(words)
    # Russian stopwords kept as data: the source tables are in Russian
    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
    filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
    common = Counter(filtered).most_common(15)
    key_terms = [w for w, _ in common]
    return {
        "summary": f"The table contains about {len(words)} words and {len(unique_words)} unique terms.",
        "materials": [],  # hook in a regex or an LLM here to extract material names
        "key_terms": key_terms
    }
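

# A minimal sketch of the regex hook mentioned above for the "materials" field;
# the pattern below is a placeholder assumption, not a vetted list of material
# names, and nothing in this module calls it yet.
import re

_MATERIAL_RE = re.compile(r"\b(сталь|бетон|железобетон|кирпич|древесин)\w*", re.IGNORECASE)


def extract_materials(table_text: str) -> list:
    """Return unique, lowercased material-like terms matched in the table text."""
    return sorted({m.group(0).lower() for m in _MATERIAL_RE.finditer(table_text)})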


def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
    """Split an oversized table Document into row-aligned, context-enriched chunks."""
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP
    # Extract critical metadata from the table before chunking
    table_metadata = extract_table_metadata(doc.text)
    table_num = doc.metadata.get('table_number', 'unknown')
    table_title = doc.metadata.get('table_title', 'unknown')
    doc_id = doc.metadata.get('document_id', 'unknown')
    section = doc.metadata.get('section', 'unknown')
    # Parse the structure produced by create_table_content
    lines = doc.text.strip().split('\n')
    # Find where the data rows start
    table_header_lines = []
    data_rows = []
    in_data = False
    for line in lines:
        if line.startswith('Table data:'):
            in_data = True
            table_header_lines.append(line)
        elif in_data and line.startswith('Row'):
            data_rows.append(line)
        elif not in_data:
            table_header_lines.append(line)
    table_header = '\n'.join(table_header_lines) + '\n'
    if not data_rows:
        log_message(f"  ⚠️ Table {table_num}: no data rows, falling back to standard splitting")
        text_splitter = SentenceSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator="\n"
        )
        text_chunks = text_splitter.split_text(doc.text)
        log_message(f"  📊 Standard splitting: {len(text_chunks)} chunks")
    else:
        # Row-based chunking: every chunk repeats the table header
        log_message(f"  📋 Table {table_num}: found {len(data_rows)} data rows")
        header_size = len(table_header)
        # Reserve space for the enrichment prefix; keep at least some room for rows
        available_size = max(chunk_size - header_size - 300, 1)
        text_chunks = []
        current_chunk_rows = []
        current_size = 0
        for row in data_rows:
            row_size = len(row) + 1
            # Flush the current chunk if adding this row would exceed the limit
            if current_size + row_size > available_size and current_chunk_rows:
                chunk_text = table_header + '\n'.join(current_chunk_rows)
                text_chunks.append(chunk_text)
                log_message(f"  ✂️ Chunk created: {len(current_chunk_rows)} rows, {len(chunk_text)} characters")
                # Overlap: carry the last two rows over into the next chunk
                overlap_count = min(2, len(current_chunk_rows))
                current_chunk_rows = current_chunk_rows[-overlap_count:]
                current_size = sum(len(r) + 1 for r in current_chunk_rows)
            current_chunk_rows.append(row)
            current_size += row_size
        # Final chunk
        if current_chunk_rows:
            chunk_text = table_header + '\n'.join(current_chunk_rows)
            text_chunks.append(chunk_text)
            log_message(f"  ✂️ Final chunk: {len(current_chunk_rows)} rows, {len(chunk_text)} characters")
        log_message(f"  📊 Table {table_num} split into {len(text_chunks)} chunks")

    # Create enriched chunks
    chunked_docs = []
    materials = table_metadata.get("materials", [])
    key_terms = table_metadata.get("key_terms", [])
    for i, chunk_text in enumerate(text_chunks):
        chunk_metadata = doc.metadata.copy()
        chunk_metadata.update({
            "chunk_id": i,
            "total_chunks": len(text_chunks),
            "chunk_size": len(chunk_text),
            "is_chunked": True,
            "materials": materials,
            "key_terms": key_terms,
            "table_summary": table_metadata.get("summary", "")
        })
        # Enrichment prefix so each chunk carries its table context
        materials_str = ', '.join(materials[:10]) if materials else 'none'
        terms_str = ', '.join(key_terms[:10]) if key_terms else 'none'
        enriched_text = f"""[Table {table_num}: {table_title}]
[Materials in table: {materials_str}]
[Key terms: {terms_str}]
{chunk_text}"""
        log_message(f"  ✓ Chunk {i+1}/{len(text_chunks)}: "
                    f"size={len(enriched_text)}, "
                    f"materials={len(materials)}, "
                    f"terms={len(key_terms)}")
        chunked_doc = Document(
            text=enriched_text,
            metadata=chunk_metadata
        )
        chunked_docs.append(chunked_doc)
    return chunked_docs
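

# Usage sketch with hypothetical data; in this module the Document normally
# comes from table_to_document below, so this helper is illustration only.
def _demo_chunk_table():
    rows = "\n".join(f"Row {i}: column: value {i}" for i in range(1, 200))
    doc = Document(
        text=f"Table: 1\nTitle: Demo\nDocument: doc-1\nSection: 1\n\nTable data:\n{rows}",
        metadata={"table_number": "1", "table_title": "Demo",
                  "document_id": "doc-1", "section": "1"},
    )
    chunks = chunk_table_document(doc, chunk_size=1024, chunk_overlap=128)
    log_message(f"Demo: split into {len(chunks)} chunks")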


def table_to_document(table_data, document_id=None):
    """Convert one table dict into one or more Documents, chunking oversized tables."""
    if not isinstance(table_data, dict):
        log_message("⚠️ SKIPPED: table_data is not a dictionary")
        return []
    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Unknown')
    table_num = table_data.get('table_number', 'Unknown')
    table_title = table_data.get('table_title', 'Unknown')
    section = table_data.get('section', 'Unknown')
    table_rows = table_data.get('data', [])
    if not table_rows:
        log_message(f"⚠️ SKIPPED: Table {table_num} from '{doc_id}' has no rows in 'data'")
        return []
    content = create_table_content(table_data)
    content_size = len(content)
    row_count = len(table_rows)
    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "total_rows": row_count,
            "content_size": content_size
        }
    )
    if content_size > CHUNK_SIZE:
        log_message(f"📊 CHUNKING: Table {table_num} from '{doc_id}' | "
                    f"Size: {content_size} > {CHUNK_SIZE} | Rows: {row_count}")
        chunked_docs = chunk_table_document(base_doc)
        log_message(f"  ✂️ Split into {len(chunked_docs)} chunks")
        for i, chunk_doc in enumerate(chunked_docs):
            log_message(f"    Chunk {i+1}: {chunk_doc.metadata['chunk_size']} characters")
        return chunked_docs
    else:
        log_message(f"✓ ADDED: Table {table_num} from document '{doc_id}' | "
                    f"Size: {content_size} characters | Rows: {row_count}")
        return [base_doc]


def load_table_data(repo_id, hf_token, table_data_dir):
    """Download table JSON files from a Hugging Face dataset repo and convert them to Documents."""
    log_message("=" * 60)
    log_message("STARTING TABLE DATA LOAD")
    log_message("=" * 60)
    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
        log_message(f"Found {len(table_files)} JSON table files")
        table_documents = []
        stats = {
            'total_tables': 0,
            'total_size': 0,
            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
        }
        for file_path in table_files:
            try:
                # Download into the default HF cache (the original passed local_dir='')
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    repo_type="dataset",
                    token=hf_token
                )
                log_message(f"\nProcessing file: {file_path}")
                with open(local_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)
                if isinstance(table_data, dict):
                    document_id = table_data.get('document', 'unknown')
                    if 'sheets' in table_data:
                        # Multi-sheet file: convert each sheet in table-number order
                        sorted_sheets = sorted(
                            table_data['sheets'],
                            key=lambda sheet: sheet.get('table_number', '')
                        )
                        docs_list = []
                        for sheet in sorted_sheets:
                            sheet['document'] = document_id
                            docs_list.extend(table_to_document(sheet, document_id))
                    else:
                        docs_list = table_to_document(table_data, document_id)
                    table_documents.extend(docs_list)
                    # Accumulate per-document statistics (chunked tables count per chunk)
                    for doc in docs_list:
                        stats['total_tables'] += 1
                        size = doc.metadata.get('content_size', 0)
                        stats['total_size'] += size
                        stats['by_document'][document_id]['count'] += 1
                        stats['by_document'][document_id]['size'] += size
            except Exception as e:
                log_message(f"❌ ERROR in file {file_path}: {str(e)}")
                continue

        # Log summary statistics
        log_message("\n" + "=" * 60)
        log_message("TABLE STATISTICS")
        log_message("=" * 60)
        log_message(f"Total tables added: {stats['total_tables']}")
        log_message(f"Total size: {stats['total_size']:,} characters")
        avg_size = stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0
        log_message(f"Average table size: {avg_size:,} characters")
        log_message("\nBy document:")
        for doc_id, doc_stats in sorted(stats['by_document'].items()):
            log_message(f"  • {doc_id}: {doc_stats['count']} tables, "
                        f"{doc_stats['size']:,} characters")
        log_message("=" * 60)
        return table_documents
    except Exception as e:
        log_message(f"❌ CRITICAL ERROR loading table data: {str(e)}")
        return []
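

# Entry-point sketch: the repo id, token source, and directory prefix below are
# placeholders for illustration, not values taken from this project's config.
if __name__ == "__main__":
    import os

    docs = load_table_data(
        repo_id="your-org/your-dataset",        # placeholder repo
        hf_token=os.environ.get("HF_TOKEN"),    # assumes a token in the environment
        table_data_dir="table_data/",           # placeholder directory prefix
    )
    log_message(f"Loaded {len(docs)} table documents")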