from collections import defaultdict
import json

from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document

from my_logging import log_message
from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE


def create_table_content(table_data):
    """Render a parsed table dict as plain text for indexing."""
    doc_id = table_data.get('document_id', table_data.get('document', 'Unknown'))
    table_num = table_data.get('table_number', 'Unknown')
    table_title = table_data.get('table_title', 'Unknown')
    section = table_data.get('section', 'Unknown')

    content = f"Table: {table_num}\n"
    content += f"Title: {table_title}\n"
    content += f"Document: {doc_id}\n"
    content += f"Section: {section}\n"

    headers = table_data.get('headers', [])
    if headers:
        content += f"\nHeaders: {' | '.join(headers)}\n"

    if 'data' in table_data and isinstance(table_data['data'], list):
        content += "\nTable data:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):
            if isinstance(row, dict):
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
                content += f"Row {row_idx}: {row_text}\n"

    return content


def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
    """Split an oversized table document into chunks, repeating the header in each chunk."""
    lines = doc.text.strip().split('\n')
    header_lines = []
    data_rows = []
    in_data = False

    # Separate the header block from the data rows; the line prefixes here
    # must match the strings emitted by create_table_content.
    for line in lines:
        if line.startswith('Table data:'):
            in_data = True
            header_lines.append(line)
        elif in_data and line.startswith('Row'):
            data_rows.append(line)
        elif not in_data:
            header_lines.append(line)

    header = '\n'.join(header_lines) + '\n'

    if not data_rows:
        return [doc]

    chunks = []
    current_rows = []
    current_size = len(header)

    for row in data_rows:
        row_size = len(row) + 1  # +1 for the newline joining rows

        # Check both limits: chunk size and row count
        if ((current_size + row_size > max_chunk_size
                or len(current_rows) >= max_rows_per_chunk)
                and current_rows):
            chunk_text = header + '\n'.join(current_rows)
            chunks.append(chunk_text)
            log_message(f"Created table chunk of {len(chunk_text)} characters with {len(current_rows)} rows")
            current_rows = []
            current_size = len(header)

        current_rows.append(row)
        current_size += row_size
        log_message(f"Added row to current chunk, current size {current_size} characters")

    # Add final chunk
    if current_rows:
        chunk_text = header + '\n'.join(current_rows)
        chunks.append(chunk_text)
        log_message(f"Created final table chunk of {len(chunk_text)} characters with {len(current_rows)} rows")

    # Create Document objects
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_doc = Document(
            text=chunk_text,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": i,
                "total_chunks": len(chunks),
                "content_size": len(chunk_text),  # consumed by load_table_data statistics
                "is_chunked": True
            }
        )
        chunked_docs.append(chunk_doc)

    return chunked_docs


def table_to_document(table_data, document_id=None):
    """Convert one table dict into a Document, chunking it if it exceeds the size limit."""
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Unknown')
    table_num = table_data.get('table_number', 'Unknown')
    section = table_data.get('section', 'Unknown')

    table_rows = table_data.get('data', [])
    if not table_rows:
        return []

    # Reuse the shared renderer so the text layout matches exactly what
    # chunk_table_document expects to parse.
    content = create_table_content({**table_data, 'document_id': doc_id})

    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "document_id": doc_id,
            "section": section,
            "content_size": len(content)  # consumed by load_table_data statistics
        }
    )

    if len(content) > MAX_CHARS_TABLE:
        chunks = chunk_table_document(base_doc)
        log_message(f"Table {table_num} split into {len(chunks)} parts")
        return chunks

    return [base_doc]
def load_table_data(repo_id, hf_token, table_data_dir):
    """Download table JSON files from a HF dataset repo and convert them to Documents."""
    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
        log_message(f"Found {len(table_files)} table JSON files")

        table_documents = []
        stats = {
            'total_tables': 0,
            'total_size': 0,
            'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
        }

        for file_path in table_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    repo_type="dataset",
                    token=hf_token
                )
                log_message(f"\nProcessing file: {file_path}")

                with open(local_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)

                if isinstance(table_data, dict):
                    document_id = table_data.get('document', 'unknown')

                    if 'sheets' in table_data:
                        sorted_sheets = sorted(
                            table_data['sheets'],
                            key=lambda sheet: sheet.get('table_number', '')
                        )
                        for sheet in sorted_sheets:
                            sheet['document'] = document_id
                            docs_list = table_to_document(sheet, document_id)
                            table_documents.extend(docs_list)
                            sheet_size = 0
                            for doc in docs_list:
                                stats['total_tables'] += 1
                                size = doc.metadata.get('content_size', 0)
                                sheet_size += size
                                stats['total_size'] += size
                                stats['by_document'][document_id]['count'] += 1
                                stats['by_document'][document_id]['size'] += size
                            log_message(f"Added table {sheet.get('table_number', 'Unknown')} from document {document_id}, {sheet_size} characters")
                    else:
                        docs_list = table_to_document(table_data, document_id)
                        table_documents.extend(docs_list)
                        for doc in docs_list:
                            stats['total_tables'] += 1
                            size = doc.metadata.get('content_size', 0)
                            stats['total_size'] += size
                            stats['by_document'][document_id]['count'] += 1
                            stats['by_document'][document_id]['size'] += size

            except Exception as e:
                log_message(f"❌ ERROR in file {file_path}: {str(e)}")
                continue

        # Log summary statistics
        avg_size = stats['total_size'] // stats['total_tables'] if stats['total_tables'] else 0
        log_message("\n" + "=" * 60)
        log_message("TABLE STATISTICS")
        log_message("=" * 60)
        log_message(f"Total tables added: {stats['total_tables']}")
        log_message(f"Total size: {stats['total_size']:,} characters")
        log_message(f"Average table size: {avg_size:,} characters")
        log_message("\nBy document:")
        for doc_id, doc_stats in sorted(stats['by_document'].items()):
            log_message(f"  • {doc_id}: {doc_stats['count']} tables, "
                        f"{doc_stats['size']:,} characters")
        log_message("=" * 60)

        return table_documents

    except Exception as e:
        log_message(f"❌ CRITICAL ERROR loading table data: {str(e)}")
        return []
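
if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the ingestion pipeline: builds one
    # invented table dict and runs it through table_to_document / chunk_table_document.
    # All field values below are hypothetical illustration data; running this still
    # requires the repo's my_logging and config modules to be importable, and whether
    # the table gets chunked depends on MAX_CHARS_TABLE / MAX_ROWS_TABLE in config.
    sample_table = {
        "document_id": "DOC-0000",       # hypothetical document id
        "table_number": "1",
        "table_title": "Sample table",
        "section": "1.1",
        "headers": ["Parameter", "Value"],
        "data": [
            {"Parameter": f"param_{i}", "Value": str(i)} for i in range(1, 51)
        ],
    }

    docs = table_to_document(sample_table)
    print(f"Produced {len(docs)} document(s)")
    for d in docs:
        # chunk_id is None when the table fit into a single un-chunked document
        print(d.metadata.get("chunk_id"), d.metadata.get("content_size"))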