from collections import defaultdict
import json

from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document

from my_logging import log_message


def create_table_content(table_data):
    """Render one table's metadata and rows as a human-readable text block.

    Args:
        table_data: dict with optional keys ``document_id``/``document``,
            ``table_number``, ``table_title``, ``section``, ``headers``
            (list of str) and ``data`` (list of row dicts). Missing fields
            fall back to the placeholder 'Неизвестно'.

    Returns:
        Multi-line string: header fields, optional column headers, then one
        line per dict row (non-dict rows and falsy cell values are skipped).
    """
    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    parts = [
        f"Таблица: {table_num}\n",
        f"Название: {table_title}\n",
        f"Документ: {doc_id}\n",
        f"Раздел: {section}\n",
    ]

    headers = table_data.get('headers', [])
    if headers:
        parts.append(f"\nЗаголовки: {' | '.join(headers)}\n")

    if 'data' in table_data and isinstance(table_data['data'], list):
        parts.append("\nДанные таблицы:\n")
        for row_idx, row in enumerate(table_data['data'], start=1):
            # Only dict rows are rendered; falsy cell values are dropped.
            if isinstance(row, dict):
                row_text = " | ".join(f"{k}: {v}" for k, v in row.items() if v)
                parts.append(f"Строка {row_idx}: {row_text}\n")

    return "".join(parts)


def table_to_document(table_data, document_id=None):
    """Convert one table dict into a single-element list of Documents.

    Args:
        table_data: table dict (same schema as ``create_table_content``).
            Non-dict input yields an empty list.
        document_id: optional explicit document id; falls back to the
            ``document_id``/``document`` keys of *table_data*.

    Returns:
        ``[Document]`` with the formatted table text and metadata, or ``[]``
        when *table_data* is not a dict.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get(
        'document_id', table_data.get('document', 'Неизвестно')
    )
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    content = create_table_content(table_data)
    content_size = len(content)

    # Guard against non-list 'data' (e.g. None), which previously raised
    # TypeError in len(); only list rows are rendered anyway.
    data = table_data.get('data')
    row_count = len(data) if isinstance(data, list) else 0

    # Log table addition
    log_message(
        f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
        f"Размер: {content_size} символов | Строк: {row_count}"
    )

    return [Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "total_rows": row_count,
            "content_size": content_size,
        },
    )]


def _record_stats(stats, document_id, docs_list):
    """Fold freshly created documents into the running statistics dict."""
    for doc in docs_list:
        stats['total_tables'] += 1
        size = doc.metadata.get('content_size', 0)
        stats['total_size'] += size
        stats['by_document'][document_id]['count'] += 1
        stats['by_document'][document_id]['size'] += size


def load_table_data(repo_id, hf_token, table_data_dir):
    """Download JSON table files from a HF dataset repo and build Documents.

    Args:
        repo_id: Hugging Face dataset repository id.
        hf_token: access token for the repository.
        table_data_dir: path prefix inside the repo that holds the table
            JSON files.

    Returns:
        List of Documents built from all tables; ``[]`` on a fatal error.
        Per-file errors are logged and skipped.
    """
    log_message("=" * 60)
    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
    log_message("=" * 60)

    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [
            f for f in files
            if f.startswith(table_data_dir) and f.endswith('.json')
        ]
        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")

        table_documents = []
        stats = {
            'total_tables': 0,
            'total_size': 0,
            'by_document': defaultdict(lambda: {'count': 0, 'size': 0}),
        }

        for file_path in table_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    # NOTE(review): local_dir='' downloads into the current
                    # working directory — confirm this is intended.
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token,
                )
                log_message(f"\nОбработка файла: {file_path}")

                with open(local_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)

                if isinstance(table_data, dict):
                    document_id = table_data.get('document', 'unknown')
                    if 'sheets' in table_data:
                        # NOTE(review): raises TypeError if 'table_number'
                        # values mix str and int — verify upstream data.
                        sorted_sheets = sorted(
                            table_data['sheets'],
                            key=lambda sheet: sheet.get('table_number', ''),
                        )
                        for sheet in sorted_sheets:
                            sheet['document'] = document_id
                            docs_list = table_to_document(sheet, document_id)
                            table_documents.extend(docs_list)
                            _record_stats(stats, document_id, docs_list)
                    else:
                        docs_list = table_to_document(table_data, document_id)
                        table_documents.extend(docs_list)
                        _record_stats(stats, document_id, docs_list)
            except Exception as e:
                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
                continue

        # Log summary statistics
        log_message("\n" + "=" * 60)
        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
        log_message("=" * 60)
        log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
        log_message(f"Общий размер: {stats['total_size']:,} символов")
        log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
        log_message("\nПо документам:")
        for doc_id, doc_stats in sorted(stats['by_document'].items()):
            log_message(f"  • {doc_id}: {doc_stats['count']} таблиц, "
                        f"{doc_stats['size']:,} символов")
        log_message("=" * 60)

        return table_documents

    except Exception as e:
        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
        return []