from collections import defaultdict
import json
from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document
from my_logging import log_message
def create_table_content(table_data):
    """Render a table dictionary as plain text.

    The output starts with four header lines (table number, title, document
    and section), followed by an optional line with the column headers and
    one line per dictionary row of ``table_data['data']``.

    Args:
        table_data: Mapping describing a table. Keys read here are
            ``document_id`` (falling back to ``document``), ``table_number``,
            ``table_title``, ``section``, ``headers`` and ``data``. Missing
            descriptive keys are rendered with a placeholder string.

    Returns:
        The formatted text. Rows are only rendered when ``data`` is a list;
        rows that are not dictionaries are skipped but still consume a row
        number, so the numbering reflects positions in the source data.
    """
    unknown = 'Неизвестно'
    doc_id = table_data.get('document_id', table_data.get('document', unknown))
    table_num = table_data.get('table_number', unknown)
    table_title = table_data.get('table_title', unknown)
    section = table_data.get('section', unknown)

    parts = [
        f"Таблица: {table_num}\n",
        f"Название: {table_title}\n",
        f"Документ: {doc_id}\n",
        f"Раздел: {section}\n",
    ]

    headers = table_data.get('headers', [])
    if headers:
        # Headers may be numbers (e.g. year columns); str.join needs strings.
        header_line = ' | '.join(str(header) for header in headers)
        parts.append(f"\nЗаголовки: {header_line}\n")

    rows = table_data.get('data')
    if isinstance(rows, list):
        parts.append("\nДанные таблицы:\n")
        for row_idx, row in enumerate(rows, start=1):
            if not isinstance(row, dict):
                continue
            # Drop empty cells (None, "", empty containers) but keep numeric
            # values even when they are zero: 0 is real table data.
            cells = [
                f"{key}: {value}"
                for key, value in row.items()
                if value or isinstance(value, (int, float))
            ]
            row_text = " | ".join(cells)
            parts.append(f"Строка {row_idx}: {row_text}\n")

    return "".join(parts)
def table_to_document(table_data, document_id=None):
    """Convert one table dictionary into a list holding a single Document.

    Args:
        table_data: Mapping describing a table (see ``create_table_content``
            for the keys that are rendered into the text).
        document_id: Optional document identifier. When truthy it takes
            precedence over ``table_data['document_id']`` and
            ``table_data['document']`` for the metadata and the log line.

    Returns:
        A one-element list with the ``Document``, or an empty list when
        ``table_data`` is not a dictionary. A list is returned so callers can
        ``extend`` their collection without special-casing rejected input.
    """
    if not isinstance(table_data, dict):
        return []

    unknown = 'Неизвестно'
    doc_id = document_id or table_data.get(
        'document_id', table_data.get('document', unknown)
    )
    table_num = table_data.get('table_number', unknown)
    table_title = table_data.get('table_title', unknown)
    section = table_data.get('section', unknown)

    content = create_table_content(table_data)
    content_size = len(content)

    # Count rows only when 'data' is a list: that is the only shape the text
    # renderer emits rows for, and len() would raise on None or a number.
    rows = table_data.get('data')
    row_count = len(rows) if isinstance(rows, list) else 0

    log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
                f"Размер: {content_size} символов | Строк: {row_count}")

    return [Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            # section_id mirrors section; both keys are populated from the
            # same value.
            "section_id": section,
            "total_rows": row_count,
            "content_size": content_size,
        },
    )]
def _table_sort_key(sheet):
    """Return a sort key for a sheet that tolerates mixed table-number types."""
    number = sheet.get('table_number', '')
    # Numbers sort numerically and come first; everything else sorts by its
    # string form. Comparing raw int and str values would raise TypeError.
    if isinstance(number, (int, float)):
        return (0, number, '')
    return (1, 0, str(number))


def load_table_data(repo_id, hf_token, table_data_dir):
    """Download table JSON files from a Hub dataset repo and build Documents.

    Every repository file whose path starts with ``table_data_dir`` and ends
    with ``.json`` is downloaded and parsed. A file is expected to contain a
    JSON object that either describes one table or has a ``sheets`` list of
    table objects; sheets are processed in ``table_number`` order.

    Args:
        repo_id: Identifier of the dataset repository.
        hf_token: Access token passed to the Hub client calls.
        table_data_dir: Path prefix used to select the table files.

    Returns:
        A list of Documents, one per converted table. A file that fails to
        download or parse is logged and skipped. If listing the repository
        (or anything outside the per-file handling) fails, the error is
        logged and an empty list is returned.
    """
    log_message("=" * 60)
    log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
    log_message("=" * 60)

    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
        table_files = [
            f for f in files
            if f.startswith(table_data_dir) and f.endswith('.json')
        ]
        log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")

        table_documents = []
        stats = {
            'total_tables': 0,
            'total_size': 0,
            'by_document': defaultdict(lambda: {'count': 0, 'size': 0}),
        }

        for file_path in table_files:
            try:
                # NOTE(review): local_dir='' is passed through unchanged;
                # presumably this resolves to the working directory — confirm.
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=file_path,
                    local_dir='',
                    repo_type="dataset",
                    token=hf_token
                )
                log_message(f"\nОбработка файла: {file_path}")

                with open(local_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)

                if not isinstance(table_data, dict):
                    # Previously ignored silently; make the skip visible.
                    log_message(f"⚠ ПРОПУЩЕН файл {file_path}: ожидался JSON-объект")
                    continue

                document_id = table_data.get('document', 'unknown')

                if 'sheets' in table_data:
                    raw_sheets = table_data['sheets']
                    sheets = [s for s in raw_sheets if isinstance(s, dict)]
                    if len(sheets) != len(raw_sheets):
                        # One malformed sheet should not discard the file.
                        log_message(
                            f"⚠ Пропущено листов (не объект): "
                            f"{len(raw_sheets) - len(sheets)} в файле {file_path}"
                        )
                    tables = sorted(sheets, key=_table_sort_key)
                    for sheet in tables:
                        # Propagate the file-level document name so the
                        # rendered text can fall back to it.
                        sheet['document'] = document_id
                else:
                    tables = [table_data]

                for table in tables:
                    docs_list = table_to_document(table, document_id)
                    table_documents.extend(docs_list)
                    for doc in docs_list:
                        size = doc.metadata.get('content_size', 0)
                        stats['total_tables'] += 1
                        stats['total_size'] += size
                        stats['by_document'][document_id]['count'] += 1
                        stats['by_document'][document_id]['size'] += size

            except Exception as e:
                # Best effort: a broken file must not abort the whole load.
                log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
                continue

        # Log summary statistics
        total_tables = stats['total_tables']
        total_size = stats['total_size']
        average_size = total_size // total_tables if total_tables > 0 else 0

        log_message("\n" + "=" * 60)
        log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
        log_message("=" * 60)
        log_message(f"Всего таблиц добавлено: {total_tables}")
        log_message(f"Общий размер: {total_size:,} символов")
        log_message(f"Средний размер таблицы: {average_size:,} символов")

        log_message("\nПо документам:")
        for doc_id, doc_stats in sorted(stats['by_document'].items()):
            log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
                        f"{doc_stats['size']:,} символов")

        log_message("=" * 60)
        return table_documents

    except Exception as e:
        log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
        return []