Spaces:

AlserFurma
/

LipSyncAI

Sleeping

App Files Files Community

AlserFurma committed on Dec 1, 2025

Commit

a0c5931

verified ·

1 Parent(s): 3456931

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -131

app.py CHANGED Viewed

@@ -1,5 +1,3 @@
-# Полная исправленная версия app.py
 import gradio as gr
 import os
 from PIL import Image
@@ -10,51 +8,48 @@ from transformers import VitsModel, AutoTokenizer, pipeline
 import scipy.io.wavfile as wavfile
 import traceback
 import random
 # =========================
-# Параметры
 # =========================
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
 # =========================
-# Загрузка моделей
 # =========================
 try:
-    # TTS модель (казахский)
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
     tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
-    # Перевод ru -> kk
     translator = pipeline(
         "translation",
         model="facebook/nllb-200-distilled-600M",
         device=0 if device == "cuda" else -1
     )
-    # Модель генерации вопросов
     qa_model = pipeline(
         "text2text-generation",
         model="google/flan-t5-small",
         device=0 if device == "cuda" else -1
     )
-    print("✅ Все модели успешно загружены!")
 except Exception as e:
-    raise RuntimeError(f"❌ Ошибка загрузки моделей: {str(e)}")
 # =========================
-# Вспомогательные функции
 # =========================
 def generate_quiz(text: str):
     prompt = (
-        "Сгенерируй один учебный вопрос по этому тексту и дай 1 правильный и 1 неправильный вариант.\n"
-        "Строго используй такой формат (каждая часть с новой строки):\n"
         "QUESTION: ...\n"
         "CORRECT: ...\n"
         "WRONG: ...\n"
@@ -66,44 +61,41 @@ def generate_quiz(text: str):
     except Exception as e:
         raise RuntimeError(f"Ошибка генерации вопроса: {e}")
-    # Унифицируем текст
-    data = out.replace("\r", "")
-    # --- Пытаемся достать через регулярные выражения ---
-    q = re.search(r"QUESTION:\s*(.+)", data, re.IGNORECASE)
-    c = re.search(r"CORRECT:\s*(.+)", data, re.IGNORECASE)
-    w = re.search(r"WRONG:\s*(.+)", data, re.IGNORECASE)
     question = q.group(1).strip() if q else ""
     correct = c.group(1).strip() if c else ""
     wrong = w.group(1).strip() if w else ""
-    # --- Если пусто — fallback парсер ---
     if not (question and correct and wrong):
-        lines = [ln.strip() for ln in data.split('\n') if ln.strip()]
-        for ln in lines:
-            if ln.lower().startswith("question"):
-                question = ln.split(":", 1)[1].strip()
-            elif ln.lower().startswith("correct"):
-                correct = ln.split(":", 1)[1].strip()
-            elif ln.lower().startswith("wrong"):
-                wrong = ln.split(":", 1)[1].strip()
-    # --- Если всё ещё пусто — ошибка ---
     if not (question and correct and wrong):
-        raise ValueError(
-            f"Модель вывела неподходящий формат:\n---\n{out}\n---"
-        )
-    # Случайно перемешать варианты
     options = [correct, wrong]
     random.shuffle(options)
     return question, options, correct
 def synthesize_audio(text_ru: str):
-    """Переводит русскую строку на казахский, синтезирует аудио и возвращает путь к файлу."""
     translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
     text_kk = translation[0]["translation_text"]
@@ -112,20 +104,22 @@ def synthesize_audio(text_ru: str):
         output = tts_model(**inputs)
     waveform = output.waveform.squeeze().cpu().numpy()
-    if waveform.size == 0:
-        raise ValueError("TTS вернул пустое аудио")
-    audio = (waveform * 32767).astype('int16')
-    sampling_rate = getattr(tts_model.config, 'sampling_rate', 22050)
-    tmpf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    wavfile.write(tmpf.name, sampling_rate, audio)
-    tmpf.close()
-    return tmpf.name
 def make_talking_head(image_path: str, audio_path: str):
     client = Client(TALKING_HEAD_SPACE)
     try:
         result = client.predict(
             image_path=handle_file(image_path),
@@ -135,144 +129,124 @@ def make_talking_head(image_path: str, audio_path: str):
             api_name="/process_image_audio"
         )
     except Exception as e:
-        raise RuntimeError(f"Ошибка Talking Head API: {e}")
-    video_path = None
-    if isinstance(result, tuple) and len(result) > 0:
-        video_data = result[0]
     else:
-        video_data = result
-    if isinstance(video_data, dict):
-        video_path = video_data.get("video") or video_data.get("path") or video_data.get("file")
-    elif isinstance(video_data, str):
-        video_path = video_data
-    if not video_path:
         raise ValueError("API не вернул путь к видео")
-    return video_path
 # =========================
-# Логика Gradio
 # =========================
-def start_lesson(image: Image.Image, text: str, state):
-    if image is None or not text or not text.strip() or len(text) > 500:
-        return None, "", [], [], state
     try:
-        # Генерируем вопрос
         question, options, correct = generate_quiz(text)
-        quiz_ru = f"Вопрос: {question} Варианты: 1) {options[0]} 2) {options[1]}"
-        audio_path = synthesize_audio(quiz_ru)
-        # Сохраняем изображение
-        tmpimg = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        image.save(tmpimg.name)
-        tmpimg.close()
-        image_path = tmpimg.name
-        video_path = make_talking_head(image_path, audio_path)
-        # Стейт
-        state_data = {
-            "image_path": image_path,
             "correct": correct,
-            "options": options
         }
-        # Удаляем аудио
-        if os.path.exists(audio_path):
-            os.remove(audio_path)
-        return video_path, question, options, state_data, state_data
     except Exception as e:
         traceback.print_exc()
         return None, f"Ошибка: {e}", [], [], state
-def answer_selected(selected_option: str, state):
-    if not state:
-        return None, "Ошибка: нет состояния. Нажмите 'Запустить урок'."
-    try:
-        correct = state.get("correct")
-        image_path = state.get("image_path")
-        options = state.get("options", [])
-        if selected_option == correct:
-            reaction_ru = "Молодец!"
-            display_message = "Дұрыс!"
-        else:
-            reaction_ru = f"Неправильно. Правильный ответ: {correct}"
-            display_message = f"Қате. Дұрыс жауап: {correct}"
-        audio_path = synthesize_audio(reaction_ru)
-        reaction_video = make_talking_head(image_path, audio_path)
-        if os.path.exists(audio_path):
-            os.remove(audio_path)
-        return reaction_video, display_message
-    except Exception as e:
-        traceback.print_exc()
-        return None, f"Ошибка: {e}"
 # =========================
-# Интерфейс Gradio
 # =========================
-title = "🎓 Интерактивный бейне-лектор"
 description = (
-    "Загрузите фото лектора и текст лекции (рус., до 500 символов).<br>"
-    "Система создаст видео-лектора, задаст вопрос и предложит 2 варианта ответа.<br>"
-    "После выбора варианта — лектор коротко ответит по-казахски."
 )
 with gr.Blocks() as demo:
     gr.Markdown(f"# {title}<br>{description}")
     with gr.Row():
-        with gr.Column(scale=1):
-            inp_image = gr.Image(type='pil', label="📸 Фото лектора")
-            inp_text = gr.Textbox(lines=5, label="📝 Текст лекции (рус.)")
-            btn_start = gr.Button("Запустить урок")
-        with gr.Column(scale=1):
             out_video = gr.Video(label="🎬 Видео лектора")
             out_question = gr.Markdown(label="Вопрос")
-            btn_opt1 = gr.Button("Вариант 1")
-            btn_opt2 = gr.Button("Вариант 2")
-            out_reaction_video = gr.Video(label="🎥 Реакция лектора")
-            out_status = gr.Textbox(label="ℹ️ Статус", interactive=False)
-    lesson_state = gr.State({})
-    btn_start.click(
-        fn=start_lesson,
-        inputs=[inp_image, inp_text, lesson_state],
-        outputs=[out_video, out_question, btn_opt1, btn_opt2, lesson_state]
-    )
-    btn_opt1.click(fn=answer_selected, inputs=[btn_opt1, lesson_state],
-                   outputs=[out_reaction_video, out_status])
-    btn_opt2.click(fn=answer_selected, inputs=[btn_opt2, lesson_state],
-                   outputs=[out_reaction_video, out_status])
-    demo.load(lambda: "Готово", outputs=out_status)
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import os
 from PIL import Image
 import scipy.io.wavfile as wavfile
 import traceback
 import random
+import re
 # =========================
+# ПАРАМЕТРЫ
 # =========================
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print("Device:", device)
 # =========================
+# ЗАГРУЗКА МОДЕЛЕЙ
 # =========================
 try:
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
     tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
     translator = pipeline(
         "translation",
         model="facebook/nllb-200-distilled-600M",
         device=0 if device == "cuda" else -1
     )
     qa_model = pipeline(
         "text2text-generation",
         model="google/flan-t5-small",
         device=0 if device == "cuda" else -1
     )
+    print("Модели успешно загружены!")
 except Exception as e:
+    raise RuntimeError(f"Ошибка при загрузке моделей: {e}")
 # =========================
+# ГЕНЕРАЦИЯ ВОПРОСА
 # =========================
 def generate_quiz(text: str):
     prompt = (
+        "Сгенерируй учебный вопрос по тексту и дай 1 правильный и 1 неправильный вариант ответа.\n"
+        "СТРОГО ИСПОЛЬЗУЙ ФОРМАТ:\n"
         "QUESTION: ...\n"
         "CORRECT: ...\n"
         "WRONG: ...\n"
     except Exception as e:
         raise RuntimeError(f"Ошибка генерации вопроса: {e}")
+    text_out = out.replace("\r", "").strip()
+    # --- Regular expressions ---
+    q = re.search(r"QUESTION:\s*(.+)", text_out, re.IGNORECASE)
+    c = re.search(r"CORRECT:\s*(.+)", text_out, re.IGNORECASE)
+    w = re.search(r"WRONG:\s*(.+)", text_out, re.IGNORECASE)
     question = q.group(1).strip() if q else ""
     correct = c.group(1).strip() if c else ""
     wrong = w.group(1).strip() if w else ""
+    # --- fallback ---
     if not (question and correct and wrong):
+        lines = [l.strip() for l in text_out.split("\n") if l.strip()]
+        for l in lines:
+            if l.lower().startswith("question"):
+                question = l.split(":", 1)[1].strip()
+            elif l.lower().startswith("correct"):
+                correct = l.split(":", 1)[1].strip()
+            elif l.lower().startswith("wrong"):
+                wrong = l.split(":", 1)[1].strip()
     if not (question and correct and wrong):
+        raise ValueError(f"Модель вывела неправильный формат:\n{out}")
     options = [correct, wrong]
     random.shuffle(options)
     return question, options, correct
+# =========================
+# АУДИО НА КАЗАХСКОМ
+# =========================
 def synthesize_audio(text_ru: str):
     translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
     text_kk = translation[0]["translation_text"]
         output = tts_model(**inputs)
     waveform = output.waveform.squeeze().cpu().numpy()
+    audio = (waveform * 32767).astype("int16")
+    sr = getattr(tts_model.config, "sampling_rate", 22050)
+    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    wavfile.write(tmp.name, sr, audio)
+    tmp.close()
+    return tmp.name
+# =========================
+# TALKING HEAD
+# =========================
 def make_talking_head(image_path: str, audio_path: str):
     client = Client(TALKING_HEAD_SPACE)
     try:
         result = client.predict(
             image_path=handle_file(image_path),
             api_name="/process_image_audio"
         )
     except Exception as e:
+        raise RuntimeError(f"Ошибка вызова Talking Head API: {e}")
+    if isinstance(result, tuple):
+        result = result[0]
+    if isinstance(result, dict):
+        video = result.get("video") or result.get("file") or result.get("path")
     else:
+        video = result
+    if not video:
         raise ValueError("API не вернул путь к видео")
+    return video
 # =========================
+# GRADIO — ШАГ 1
 # =========================
+def start_lesson(image, text, state):
+    if image is None:
+        return None, "Загрузите фото", [], [], state
+    if not text or len(text) > 500:
+        return None, "Введите текст (до 500 символов)", [], [], state
     try:
         question, options, correct = generate_quiz(text)
+        quiz_ru = f"Вопрос: {question}. Варианты: 1) {options[0]}, 2) {options[1]}"
+        audio = synthesize_audio(quiz_ru)
+        tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+        image.convert("RGB").save(tmp.name)
+        image_path = tmp.name
+        tmp.close()
+        video = make_talking_head(image_path, audio)
+        os.remove(audio)
+        state_new = {
             "correct": correct,
+            "options": options,
+            "image_path": image_path
         }
+        return video, question, options, state_new, state_new
     except Exception as e:
         traceback.print_exc()
         return None, f"Ошибка: {e}", [], [], state
+# =========================
+# GRADIO — ШАГ 2
+# =========================
+def answer_selected(selected, state):
+    if not state:
+        return None, "Ошибка: урок не запущен."
+    correct = state["correct"]
+    image_path = state["image_path"]
+    if selected == correct:
+        text_ru = "Молодец!"
+        message = "Дұрыс!"
+    else:
+        text_ru = f"Неправильно. Правильный ответ: {correct}"
+        message = f"Қате. Дұрыс жауап: {correct}"
+    audio = synthesize_audio(text_ru)
+    video = make_talking_head(image_path, audio)
+    os.remove(audio)
+    return video, message
 # =========================
+# UI
 # =========================
+title = "🎓 Интерактивный видео-лектор"
 description = (
+    "Загрузите фото и текст (рус.).<br>"
+    "Лектор задаст вопрос и предложит варианты.<br>"
+    "После выбора — ответит по-казахски."
 )
 with gr.Blocks() as demo:
     gr.Markdown(f"# {title}<br>{description}")
     with gr.Row():
+        with gr.Column():
+            img = gr.Image(type='pil', label="📸 Фото лектора")
+            txt = gr.Textbox(lines=5, label="📝 Текст лекции (до 500 символов)")
+            btn = gr.Button("Запустить урок")
+        with gr.Column():
             out_video = gr.Video(label="🎬 Видео лектора")
             out_question = gr.Markdown(label="Вопрос")
+            opt1 = gr.Button("Вариант 1")
+            opt2 = gr.Button("Вариант 2")
+            react_video = gr.Video(label="🎥 Реакция")
+            status = gr.Textbox(label="Статус", interactive=False)
+    state = gr.State({})
+    btn.click(start_lesson, [img, txt, state],
+              [out_video, out_question, opt1, opt2, state])
+    opt1.click(answer_selected, [opt1, state], [react_video, status])
+    opt2.click(answer_selected, [opt2, state], [react_video, status])
+    demo.load(lambda: "Готово", outputs=status)
 if __name__ == "__main__":
     demo.launch()