import gradio as gr
import os
from PIL import Image
import tempfile
from gradio_client import Client, handle_file
import torch
from transformers import VitsModel, AutoTokenizer, pipeline
import scipy.io.wavfile as wavfile
import traceback


# =========================
# Загрузка моделей
# =========================

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# The TTS model and its tokenizer must come from the same checkpoint,
# so the identifier is spelled once.
TTS_CHECKPOINT = "facebook/mms-tts-kaz"

try:
    # Text-to-speech model (Kazakh) and its tokenizer.
    tts_model = VitsModel.from_pretrained(TTS_CHECKPOINT).to(device)
    tts_tokenizer = AutoTokenizer.from_pretrained(TTS_CHECKPOINT)

    # Translation pipeline; source/target languages are supplied per call.
    translator = pipeline(
        "translation",
        model="facebook/nllb-200-distilled-600M",
        # Device index for the pipeline: 0 when CUDA is used, -1 otherwise.
        device=0 if device == "cuda" else -1,
    )

    print("✅ Все модели успешно загружены!")

except Exception as e:
    # Start-up boundary: abort with one readable message, but chain the
    # original exception explicitly so its traceback is reported as the cause.
    raise RuntimeError(f"❌ Ошибка загрузки моделей: {e}") from e


# =========================
# Talking Head API
# =========================

# Identifier handed to gradio_client.Client() when the video is generated.
TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"


# =========================
# Основная функция
# =========================

def _validate_inputs(image, text):
    """Raise ValueError for a missing image, blank text, or text over 500 characters."""
    if image is None:
        raise ValueError("Загрузите изображение лектора!")

    if not text or not text.strip():
        raise ValueError("Введите текст лекции!")

    if len(text) > 500:
        raise ValueError("Текст превышает 500 символов!")


def _translate_to_kazakh(text):
    """Translate Russian ``text`` to Kazakh with the module-level ``translator``."""
    translation = translator(
        text,
        src_lang="rus_Cyrl",
        tgt_lang="kaz_Cyrl",
    )
    translated_text = translation[0]["translation_text"]
    print("🌍 Перевод (KK):", translated_text)

    if not translated_text.strip():
        raise ValueError("Перевод не удался!")
    return translated_text


def _synthesize_speech(text):
    """Run the module-level TTS model on ``text``; return ``(int16 samples, sampling rate)``."""
    inputs = tts_tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        output = tts_model(**inputs)

    waveform = output.waveform.squeeze().cpu().numpy()
    if waveform.size == 0:
        raise ValueError("TTS вернул пустое аудио!")

    # Clamp before scaling: a sample outside [-1, 1] would otherwise wrap
    # around when cast to int16 and produce a loud click.
    samples = (waveform.clip(-1.0, 1.0) * 32767).astype("int16")
    return samples, tts_model.config.sampling_rate


def _reserve_temp_path(suffix):
    """Create an empty temporary file, close it, and return its path."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def _extract_video_path(result):
    """Pull the video file path out of the value returned by ``client.predict``.

    Accepts a tuple/list (first element is used), a dict (keys ``video``,
    ``path``, ``file`` are tried in that order) or a plain string.

    Raises:
        ValueError: if the shape is not recognised or no path is present.
    """
    # The client may hand back a sequence or a single bare value depending on
    # how many outputs the endpoint declares, so both shapes are accepted.
    if isinstance(result, (tuple, list)) and len(result) > 0:
        video_data = result[0]
    elif isinstance(result, (dict, str)):
        video_data = result
    else:
        raise ValueError(f"Неизвестный формат ответа API: {type(result)}")

    if isinstance(video_data, dict):
        video_path = (
            video_data.get("video")
            or video_data.get("path")
            or video_data.get("file")
        )
    elif isinstance(video_data, str):
        video_path = video_data
    else:
        raise ValueError(f"Не удалось извлечь видео: {type(video_data)}")

    if not video_path:
        raise ValueError("API не вернул путь к видео!")
    return video_path


def _remove_temp_files(*paths):
    """Best-effort removal of temporary files; ``None`` entries are skipped."""
    for path in paths:
        if not path:
            continue
        try:
            os.remove(path)
            print("🗑 Удалён файл:", path)
        except FileNotFoundError:
            # Already gone - nothing to clean up.
            pass
        except OSError as exc:
            # Cleanup must never mask the real result, but a failure to
            # delete should at least be visible in the log.
            print(f"⚠️ Не удалось удалить файл {path}: {exc}")


def inference(image: "Image.Image", text: str):
    """Turn a portrait and a Russian text into a Kazakh-speaking talking-head video.

    Steps: validate the inputs, translate the text ru -> kk, synthesize
    speech, save the portrait as PNG, then send both files to the remote
    talking-head endpoint.

    Args:
        image: Portrait supplied by the UI; ``None`` when nothing was uploaded.
        text: Lecture text in Russian, at most 500 characters.

    Returns:
        A ``(video_path, status_message)`` pair. On any failure the first
        element is ``None`` and the message starts with "❌ Ошибка:".
        Exceptions are not propagated to the caller.
    """
    status = ""
    video_path = None
    audio_path = None
    img_path = None

    try:
        _validate_inputs(image, text)
        print("📥 Ввод (RU):", text)

        # Step 1 - translation.
        translated_text = _translate_to_kazakh(text)

        # Step 2 - speech synthesis. The path is recorded *before* writing so
        # the finally-clause removes the file even if the write fails.
        samples, sampling_rate = _synthesize_speech(translated_text)
        audio_path = _reserve_temp_path(".wav")
        wavfile.write(audio_path, sampling_rate, samples)
        print("🔊 Аудио создано:", audio_path)

        # Step 3 - save the portrait (same path-first ordering as above).
        img_path = _reserve_temp_path(".png")
        rgb_image = image if image.mode == "RGB" else image.convert("RGB")
        rgb_image.save(img_path)
        print("🖼 Фото сохранено:", img_path)

        # Step 4 - remote video generation.
        print("🎥 Подключение к SkyReels...")
        client = Client(TALKING_HEAD_SPACE)
        result = client.predict(
            image_path=handle_file(img_path),
            audio_path=handle_file(audio_path),
            guidance_scale=3.0,
            steps=10,
            api_name="/process_image_audio",
        )
        print("✅ RAW RESULT:", result)

        video_path = _extract_video_path(result)
        print("✅ Видео создано:", video_path)
        status = "✅ Видео успешно создано!"

    except Exception as exc:
        # UI boundary: report the problem in the status box instead of
        # crashing the request; the traceback still goes to the server log.
        status = f"❌ Ошибка: {exc}"
        print(status)
        traceback.print_exc()

    finally:
        # The inputs were uploaded (or never used); the local copies can go.
        _remove_temp_files(audio_path, img_path)

    return video_path, status


# =========================
# Интерфейс Gradio
# =========================

# Heading and introductory text rendered above the form.
title = "🎓 Бейне Оқытушы"

description = """
Суретіңізді жүктеп, дәріс мәтінін **орыс тілінде** енгізіңіз.
Жүйе автоматты түрде қазақ тіліне аударады, озвучка жасайды және бейне шығарады!

**Талаптар:**
- Фото: бет анық көрінетін
- Мәтін: 500 таңбаға дейін
"""

# Components are built in the order they are wired up: the two inputs that
# feed inference(image, text), then the two outputs it returns (video, status).
lecturer_photo = gr.Image(type="pil", label="📸 Фото дәріскер")
lecture_text = gr.Textbox(
    lines=5,
    label="📝 Дәріс мәтіні (орыс тілінде)",
    placeholder="Мәтінді енгізіңіз...",
)
result_video = gr.Video(label="🎬 Дайын бейне")
status_box = gr.Textbox(label="ℹ️ Мәртебе", interactive=False)

iface = gr.Interface(
    fn=inference,
    inputs=[lecturer_photo, lecture_text],
    outputs=[result_video, status_box],
    title=title,
    description=description,
    cache_examples=False,
    flagging_mode="never",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()