Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

RAG_AIEXP_01 / converters /converter.py

MrSimple07

added the file stats info to the UI

4973a7c 5 months ago

raw

history blame contribute delete

10.1 kB

	from config import *
	from my_logging import log_message
	import json
	import pandas as pd
	import os

	def process_uploaded_file(file, file_type):
	    """Process an uploaded file and publish it to the HuggingFace dataset repo.

	    The file is copied into a temporary directory, converted when it is an
	    Excel workbook (to JSON for tables, to CSV for image metadata), uploaded
	    with ``HfApi.upload_file`` and summarised for display in the UI.

	    Args:
	        file: A path string, or an object exposing its path as ``.name``.
	            ``None`` means that no file was selected.
	        file_type: Document category. "Таблица" and
	            "Изображение (метаданные)" get dedicated handling; any other
	            value is treated as a JSON document.

	    Returns:
	        A human-readable status string: a success report with file
	        statistics, or a message starting with "❌" on failure. Exceptions
	        are logged and reported in the returned text instead of being raised.
	    """
	    if file is None:
	        return "❌ Файл не выбран"

	    try:
	        # Imported lazily, as in the original, so the module can be imported
	        # without huggingface_hub until an upload is actually requested.
	        from huggingface_hub import HfApi
	        import tempfile
	        import shutil

	        with tempfile.TemporaryDirectory() as temp_dir:
	            source_path = file if isinstance(file, str) else file.name
	            filename = os.path.basename(source_path)
	            file_path = os.path.join(temp_dir, filename)

	            log_message(f"Начало обработки файла: {filename}")
	            log_message(f"Тип документа: {file_type}")

	            # Work on a private copy unless the source already is that path.
	            if os.path.abspath(source_path) != os.path.abspath(file_path):
	                shutil.copy(source_path, file_path)
	            else:
	                file_path = source_path

	            original_size_bytes = os.path.getsize(file_path)
	            original_size_mb = original_size_bytes / (1024 * 1024)

	            status_info = [
	                f"📁 Исходный файл: {filename}",
	                f"📦 Размер файла: {original_size_mb:.2f} МБ ({original_size_bytes:,} байт)",
	            ]

	            if file_type == "Таблица":
	                target_dir = TABLE_DATA_DIR
	                upload_file = _prepare_table_upload(file_path, filename, temp_dir, status_info)
	            elif file_type == "Изображение (метаданные)":
	                target_dir = IMAGE_DATA_DIR
	                upload_file = _prepare_image_upload(file_path, filename, temp_dir, status_info)
	            else:  # JSON document
	                target_dir = JSON_FILES_DIR
	                upload_file = _prepare_json_upload(file_path, filename, status_info)

	            # Upload while the temporary directory (and the converted file
	            # inside it) still exists.
	            repo_path = f"{target_dir}/{os.path.basename(upload_file)}"
	            log_message(f"Загрузка на HuggingFace: {repo_path}")
	            api = HfApi()
	            api.upload_file(
	                path_or_fileobj=upload_file,
	                path_in_repo=repo_path,
	                repo_id=HF_REPO_ID,
	                token=HF_TOKEN,
	                repo_type="dataset"
	            )

	            log_message(f"Файл {filename} успешно загружен в {target_dir}")

	            result_message = "✅ Файл успешно загружен и обработан\n\n"
	            result_message += "\n".join(status_info)
	            result_message += "\n\n⚠️ Нажмите кнопку 'Перезапустить систему' для применения изменений"

	            return result_message

	    except Exception as e:
	        error_msg = f"Ошибка обработки файла: {str(e)}"
	        log_message(error_msg)
	        return f"❌ {error_msg}"


	def _prepare_table_upload(file_path, filename, temp_dir, status_info):
	    """Convert an Excel table file to JSON if needed; return the path to upload.

	    Appends the table statistics lines to ``status_info`` in place.
	    """
	    if not filename.endswith(('.xlsx', '.xls')):
	        status_info.append(f"📤 Загружен как: {filename}")
	        return file_path

	    json_path = convert_single_excel_to_json(file_path, temp_dir)
	    processed_size_mb = os.path.getsize(json_path) / (1024 * 1024)

	    with open(json_path, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    total_rows = sum(len(sheet['data']) for sheet in data['sheets'])

	    status_info.append(f"📊 Всего таблиц: {len(data['sheets'])}")
	    status_info.append(f"📄 Листов в документе: {data['total_sheets']}")
	    status_info.append(f"📝 Всего строк данных: {total_rows:,}")
	    status_info.append(f"💾 Размер после обработки: {processed_size_mb:.2f} МБ")
	    status_info.append(f"📤 Загружен как: {os.path.basename(json_path)}")
	    return json_path


	def _prepare_image_upload(file_path, filename, temp_dir, status_info):
	    """Convert an Excel image-metadata file to CSV if needed; return the path to upload.

	    Appends the metadata statistics lines to ``status_info`` in place.
	    """
	    if filename.endswith(('.xlsx', '.xls')):
	        csv_path = convert_single_excel_to_csv(file_path, temp_dir)
	        processed_size_mb = os.path.getsize(csv_path) / (1024 * 1024)

	        df = pd.read_csv(csv_path)
	        status_info.append(f"🖼️ Записей изображений: {len(df):,}")
	        status_info.append(f"📋 Колонок метаданных: {len(df.columns)}")
	        status_info.append(f"💾 Размер после обработки: {processed_size_mb:.2f} МБ")
	        status_info.append(f"📤 Загружен как: {os.path.basename(csv_path)}")
	        return csv_path

	    # Statistics are best-effort here: a file that cannot be parsed as CSV
	    # is still uploaded, but the reason is logged instead of being dropped.
	    try:
	        df = pd.read_csv(file_path)
	        status_info.append(f"🖼️ Записей изображений: {len(df):,}")
	        status_info.append(f"📋 Колонок метаданных: {len(df.columns)}")
	    except Exception as exc:
	        log_message(f"Не удалось прочитать CSV для статистики: {exc}")
	    status_info.append(f"📤 Загружен как: {filename}")
	    return file_path


	def _prepare_json_upload(file_path, filename, status_info):
	    """Collect best-effort statistics for a JSON document; return the path to upload.

	    Appends the JSON statistics lines to ``status_info`` in place.
	    """
	    # Statistics are best-effort: an unreadable or malformed JSON file is
	    # still uploaded unchanged, but the failure is logged.
	    try:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            json_data = json.load(f)

	        if isinstance(json_data, list):
	            status_info.append(f"📝 Документов в JSON: {len(json_data):,}")
	        elif isinstance(json_data, dict):
	            status_info.append("📝 JSON объект (словарь)")
	            # Structured table exports carry a 'sheets' list; report its size.
	            if 'sheets' in json_data:
	                status_info.append(f"📊 Таблиц в документе: {len(json_data.get('sheets', []))}")
	    except Exception as exc:
	        log_message(f"Не удалось прочитать JSON для статистики: {exc}")
	    status_info.append(f"📤 Загружен как: {filename}")
	    return file_path

	def convert_single_excel_to_json(excel_path, output_dir):
	    """Convert one Excel workbook into the JSON layout used for tables.

	    Every sheet is read. Sheets that are empty or lack the
	    "Номер таблицы" column are skipped. The remaining rows are grouped by
	    that column, and each group becomes one entry of the output's
	    ``"sheets"`` list (so the list holds one entry per table, not per sheet).
	    Descriptive fields are taken from the first row of each group; all other
	    columns are exported as stringified row data.

	    Args:
	        excel_path: Path to the source workbook.
	        output_dir: Directory in which the JSON file is written.

	    Returns:
	        Path of the written JSON file: the workbook's base name with its
	        extension replaced by ``.json``, inside ``output_dir``.
	    """
	    metadata_columns = (
	        "Обозначение документа", "Раздел документа", "Номер таблицы",
	        "Название таблицы", "Примечание",
	    )
	    source_name = os.path.basename(excel_path)

	    df_dict = pd.read_excel(excel_path, sheet_name=None)

	    result = {
	        "document": source_name,
	        "total_sheets": len(df_dict),
	        "sheets": []
	    }

	    log_message(f"Обработка файла: {source_name}")
	    log_message(f"Найдено листов: {len(df_dict)}")

	    total_tables = 0
	    for sheet_name, df in df_dict.items():
	        if df.empty or "Номер таблицы" not in df.columns:
	            log_message(f" Лист '{sheet_name}': пропущен (пустой или отсутствует колонка 'Номер таблицы')")
	            continue

	        df = df.dropna(how='all').fillna("")
	        # The data columns are identical for every table of a sheet, so they
	        # are computed once per sheet rather than once per group.
	        headers = [col for col in df.columns if col not in metadata_columns]
	        sheet_tables = 0

	        for table_number, group in df.groupby("Номер таблицы"):
	            group = group.reset_index(drop=True)
	            first_row = group.iloc[0]

	            sheet_data = {
	                "sheet_name": sheet_name,
	                "document_id": str(first_row.get("Обозначение документа", "")),
	                "section": str(first_row.get("Раздел документа", "")),
	                "table_number": str(table_number),
	                "table_title": str(first_row.get("Название таблицы", "")),
	                "table_description": str(first_row.get("Примечание", "")),
	                "headers": list(headers),
	                # Missing cells were already replaced by "" via fillna above,
	                # so a plain str() is sufficient here.
	                "data": [
	                    {col: str(row[col]) for col in headers}
	                    for _, row in group.iterrows()
	                ]
	            }

	            result["sheets"].append(sheet_data)
	            sheet_tables += 1

	        total_tables += sheet_tables
	        log_message(f" Лист '{sheet_name}': обработано таблиц: {sheet_tables}")

	    # Swap only the real extension. Chained str.replace would also rewrite
	    # ".xls" occurring inside the name and would leave names with another
	    # extension unchanged, so the JSON could overwrite the source file.
	    json_filename = os.path.splitext(source_name)[0] + '.json'
	    json_path = os.path.join(output_dir, json_filename)

	    with open(json_path, 'w', encoding='utf-8') as f:
	        json.dump(result, f, ensure_ascii=False, indent=2)

	    log_message(f"Конвертация завершена. Всего таблиц обработано: {total_tables}")
	    log_message(f"Результат сохранен: {json_filename}")

	    return json_path

	def convert_single_excel_to_csv(excel_path, output_dir):
	    """Convert one Excel workbook into a CSV file of image metadata.

	    Only the sheet returned by ``pd.read_excel`` with its default
	    ``sheet_name`` (the first sheet) is exported.

	    Args:
	        excel_path: Path to the source workbook.
	        output_dir: Directory in which the CSV file is written.

	    Returns:
	        Path of the written CSV file: the workbook's base name with its
	        extension replaced by ``.csv``, inside ``output_dir``.
	    """
	    source_name = os.path.basename(excel_path)
	    log_message(f"Конвертация Excel в CSV: {source_name}")

	    df = pd.read_excel(excel_path)
	    # Swap only the real extension. Chained str.replace would also rewrite
	    # ".xls" occurring inside the name and would leave names with another
	    # extension unchanged, so the CSV could overwrite the source file.
	    csv_filename = os.path.splitext(source_name)[0] + '.csv'
	    csv_path = os.path.join(output_dir, csv_filename)
	    df.to_csv(csv_path, index=False, encoding='utf-8')

	    log_message(f" Строк обработано: {len(df)}")
	    log_message(f" Колонок: {len(df.columns)}")
	    log_message(f" Результат сохранен: {csv_filename}")

	    return csv_path