Spaces:

AlserFurma
/

Yapi

Build error

App Files Files Community

Yapi / app.py

AlserFurma

Update app.py

f7d34b8 verified 5 months ago

raw

history blame contribute delete

6.76 kB

	import gradio as gr
	import os
	from PIL import Image
	import tempfile
	from gradio_client import Client, handle_file
	import torch
	from transformers import VitsModel, AutoTokenizer, pipeline
	import scipy.io.wavfile as wavfile
	import traceback


	# =========================
	# Model loading
	# =========================

	# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	print(f"Using device: {device}")

	try:
	    # TTS model (Kazakh)
	    tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
	    tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")

	    # Translation ru -> kk. The pipeline takes a device index:
	    # 0 selects the first CUDA device, -1 selects the CPU.
	    translator = pipeline(
	        "translation",
	        model="facebook/nllb-200-distilled-600M",
	        device=0 if device == "cuda" else -1,
	    )

	    print("✅ Все модели успешно загружены!")

	except Exception as e:
	    # Abort start-up: nothing below can work without the models. Chain the
	    # original exception explicitly so the root cause stays in the traceback.
	    raise RuntimeError(f"❌ Ошибка загрузки моделей: {e}") from e


	# =========================
	# Talking Head API
	# =========================

	# Identifier handed to gradio_client.Client inside inference() to reach the
	# remote video-generation endpoint (presumably a Hugging Face Space id in
	# "owner/name" form -- confirm against the gradio_client documentation).
	TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"


	# =========================
	# Main function
	# =========================

	def _reserve_temp_path(suffix):
	    """Create an empty temporary file with *suffix* and return its path.

	    The handle is closed before returning, so the file can be reopened by
	    name and the caller knows the path before anything is written to it.
	    """
	    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
	        return handle.name


	def _extract_video_path(result):
	    """Return the video file path contained in a ``client.predict`` result.

	    Accepts a tuple/list (first element is used), a dict (keys ``video``,
	    ``path`` or ``file`` are tried in that order) or a plain string path.

	    Raises:
	        ValueError: if the shape is not recognised or no path is present.
	    """
	    if isinstance(result, (tuple, list)) and len(result) > 0:
	        video_data = result[0]
	    elif isinstance(result, (dict, str)):
	        video_data = result
	    else:
	        raise ValueError(f"Неизвестный формат ответа API: {type(result)}")

	    if isinstance(video_data, dict):
	        video_path = (
	            video_data.get("video")
	            or video_data.get("path")
	            or video_data.get("file")
	        )
	    elif isinstance(video_data, str):
	        video_path = video_data
	    else:
	        raise ValueError(f"Не удалось извлечь видео: {type(video_data)}")

	    if not video_path:
	        raise ValueError("API не вернул путь к видео!")

	    return video_path


	def inference(image: Image.Image, text: str):
	    """Turn a lecturer photo and Russian text into a Kazakh talking-head video.

	    Steps: validate the inputs, translate the text from Russian to Kazakh,
	    synthesize speech, save the photo and audio to temporary files, and send
	    both to the remote talking-head endpoint.

	    Args:
	        image: PIL image of the lecturer; converted to RGB before saving.
	        text: Lecture text in Russian, at most 500 characters.

	    Returns:
	        A ``(video_path, status_message)`` tuple. ``video_path`` is ``None``
	        when any step fails, and the message then carries the error text.
	        Exceptions are caught and reported, never propagated to the caller.
	    """
	    error_msg = ""
	    video_path = None
	    audio_path = None
	    img_path = None

	    try:
	        # --- Input validation ---
	        if image is None:
	            raise ValueError("Загрузите изображение лектора!")

	        if not text or not text.strip():
	            raise ValueError("Введите текст лекции!")

	        if len(text) > 500:
	            raise ValueError("Текст превышает 500 символов!")

	        print("📥 Ввод (RU):", text)

	        # --- Step 1: translation ---
	        translation = translator(
	            text,
	            src_lang="rus_Cyrl",
	            tgt_lang="kaz_Cyrl",
	        )

	        translated_text = translation[0]["translation_text"]
	        print("🌍 Перевод (KK):", translated_text)

	        if not translated_text.strip():
	            raise ValueError("Перевод не удался!")

	        # --- Step 2: speech synthesis ---
	        inputs = tts_tokenizer(translated_text, return_tensors="pt").to(device)

	        with torch.no_grad():
	            output = tts_model(**inputs)

	        waveform = output.waveform.squeeze().cpu().numpy()

	        if waveform.size == 0:
	            raise ValueError("TTS вернул пустое аудио!")

	        # Clip before scaling: a sample outside [-1, 1] would otherwise wrap
	        # around when cast to int16 and produce a loud click.
	        audio = (waveform.clip(-1.0, 1.0) * 32767).astype("int16")
	        sampling_rate = tts_model.config.sampling_rate

	        # Record the path BEFORE writing so the finally-block can remove the
	        # file even if the write itself fails.
	        audio_path = _reserve_temp_path(".wav")
	        wavfile.write(audio_path, sampling_rate, audio)

	        print("🔊 Аудио создано:", audio_path)

	        # --- Step 3: save the photo ---
	        if image.mode != "RGB":
	            image = image.convert("RGB")

	        img_path = _reserve_temp_path(".png")
	        image.save(img_path)

	        print("🖼 Фото сохранено:", img_path)

	        # --- Step 4: video generation ---
	        print("🎥 Подключение к SkyReels...")
	        client = Client(TALKING_HEAD_SPACE)

	        result = client.predict(
	            image_path=handle_file(img_path),
	            audio_path=handle_file(audio_path),
	            guidance_scale=3.0,
	            steps=10,
	            api_name="/process_image_audio",
	        )

	        print("✅ RAW RESULT:", result)

	        # --- Result parsing ---
	        video_path = _extract_video_path(result)

	        print("✅ Видео создано:", video_path)
	        error_msg = "✅ Видео успешно создано!"

	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of
	        # letting it escape, and keep the full traceback in the logs.
	        error_msg = f"❌ Ошибка: {e}"
	        print(error_msg)
	        traceback.print_exc()

	    finally:
	        # --- Temporary file cleanup (best effort) ---
	        for tmp_path in (audio_path, img_path):
	            if not tmp_path:
	                continue
	            try:
	                os.remove(tmp_path)
	                print("🗑 Удалён файл:", tmp_path)
	            except FileNotFoundError:
	                # Already gone; nothing to clean up.
	                pass
	            except OSError as cleanup_err:
	                # Never fail the request because of cleanup, but do not
	                # hide the problem either.
	                print("⚠️ Не удалось удалить файл:", tmp_path, cleanup_err)

	    return video_path, error_msg


	# =========================
	# Gradio interface
	# =========================

	# Page heading shown above the form.
	title = "🎓 Бейне Оқытушы"

	# Instructions shown under the heading (user-facing text, kept verbatim).
	description = """
	Суретіңізді жүктеп, дәріс мәтінін орыс тілінде енгізіңіз.
	Жүйе автоматты түрде қазақ тіліне аударады, озвучка жасайды және бейне шығарады!

	Талаптар:
	- Фото: бет анық көрінетін
	- Мәтін: 500 таңбаға дейін
	"""

	# Two inputs (photo, text) feed inference(); its two return values
	# (video path, status message) fill the two outputs in the same order.
	iface = gr.Interface(
	    title=title,
	    description=description,
	    fn=inference,
	    inputs=[
	        gr.Image(type="pil", label="📸 Фото дәріскер"),
	        gr.Textbox(
	            lines=5,
	            label="📝 Дәріс мәтіні (орыс тілінде)",
	            placeholder="Мәтінді енгізіңіз...",
	        ),
	    ],
	    outputs=[
	        gr.Video(label="🎬 Дайын бейне"),
	        gr.Textbox(label="ℹ️ Мәртебе", interactive=False),
	    ],
	    flagging_mode="never",
	    cache_examples=False,
	)

	# Start the web server only when the file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()