Spaces:

AlserFurma
/

LipSyncAI

Sleeping

App Files Files Community

AlserFurma commited on Dec 1, 2025

Commit

444e569

verified ·

1 Parent(s): 1873d97

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -135

app.py CHANGED Viewed

@@ -8,24 +8,20 @@ from transformers import VitsModel, AutoTokenizer, pipeline
 import scipy.io.wavfile as wavfile
 import traceback
 import random
-import json
-import re
 # =========================
 # Параметры
 # =========================
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Device set to use {device}")
 # =========================
 # Загрузка моделей
 # =========================
 try:
-    # TTS (казахский)
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
     tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
@@ -36,86 +32,51 @@ try:
         device=0 if device == "cuda" else -1
     )
-    # Генерация учебных вопросов (стабильная CPU-модель)
     qa_model = pipeline(
         "text2text-generation",
-        model="t5-base",   # <-- ВАЖНО: существующая стабильная модель!
         device=0 if device == "cuda" else -1
     )
-    print("Models loaded successfully!")
 except Exception as e:
-    raise RuntimeError(f"Model loading error: {str(e)}")
 # =========================
-# Генерация учебного вопроса
 # =========================
 def generate_quiz(text: str):
     prompt = (
-        "Сгенерируй учебный вопрос по тексту и дай один правильный и один неправильный ответ. "
-        "Верни ТОЛЬКО JSON без комментариев:\n"
-        "{\n"
-        "  \"question\": \"...\",\n"
-        "  \"correct\": \"...\",\n"
-        "  \"wrong\": \"...\"\n"
-        "}\n"
-        f"TEXT: {text}"
     )
-    # 1. Генерация
-    out = qa_model(prompt, max_new_tokens=200)[0]["generated_text"].strip()
-    # 2. Повторная попытка при пустом выводе
-    if not out:
-        out = qa_model(prompt, max_new_tokens=200)[0]["generated_text"].strip()
-        if not out:
-            raise ValueError("Модель дважды вернула пустой ответ.")
-    # 3. Извлечение JSON
     try:
-        json_str = out[out.index("{"): out.rindex("}") + 1]
-    except Exception:
-        # fallback
-        q = re.search(r'"?question"?\s*[:=]\s*[\'"](.+?)[\'"]', out)
-        c = re.search(r'"?correct"?\s*[:=]\s*[\'"](.+?)[\'"]', out)
-        w = re.search(r'"?wrong"?\s*[:=]\s*[\'"](.+?)[\'"]', out)
-        if q and c and w:
-            json_str = json.dumps({
-                "question": q.group(1),
-                "correct": c.group(1),
-                "wrong": w.group(1)
-            })
-        else:
             raise ValueError(f"Модель вывела неподходящий формат:\n{out}")
-    json_str = json_str.replace("\n", "")
-    try:
-        data = json.loads(json_str)
-    except Exception:
-        data = json.loads(json_str.replace("'", "\""))
-    question = data.get("question", "").strip()
-    correct = data.get("correct", "").strip()
-    wrong = data.get("wrong", "").strip()
-    if not (question and correct and wrong):
-        raise ValueError("JSON не содержит нужных полей")
-    options = [correct, wrong]
-    random.shuffle(options)
-    return question, options, correct
-# =========================
-# Синтез речи
-# =========================
 def synthesize_audio(text_ru: str):
     translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
     text_kk = translation[0]["translation_text"]
@@ -124,20 +85,17 @@ def synthesize_audio(text_ru: str):
         output = tts_model(**inputs)
     waveform = output.waveform.squeeze().cpu().numpy()
-    audio = (waveform * 32767).astype("int16")
-    sr = getattr(tts_model.config, "sampling_rate", 22050)
-    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    wavfile.write(tmp.name, sr, audio)
-    tmp.close()
-    return tmp.name
-# =========================
-# Talking Head
-# =========================
 def make_talking_head(image_path: str, audio_path: str):
     client = Client(TALKING_HEAD_SPACE)
     result = client.predict(
         image_path=handle_file(image_path),
@@ -146,107 +104,119 @@ def make_talking_head(image_path: str, audio_path: str):
         steps=10,
         api_name="/process_image_audio"
     )
-    if isinstance(result, tuple):
-        return result[0]
-    return result
 # =========================
-# Шаг 1 — старт урока
 # =========================
 def start_lesson(image: Image.Image, text: str, state):
-    if image is None:
-        return None, "Загрузите фото", [], state
-    if not text:
-        return None, "Введите текст", [], state
-    if len(text) > 500:
-        return None, "Текст слишком длинный", [], state
     try:
         question, options, correct = generate_quiz(text)
-        quiz_text = f"Вопрос: {question}. Варианты: 1) {options[0]} 2) {options[1]}"
-        audio_path = synthesize_audio(quiz_text)
-        tmpimg = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
-        if image.mode != "RGB":
-            image = image.convert("RGB")
         image.save(tmpimg.name)
         tmpimg.close()
-        video_path = make_talking_head(tmpimg.name, audio_path)
-        state = {
-            "image_path": tmpimg.name,
-            "correct": correct,
-            "options": options
-        }
-        return video_path, question, options, state, state
     except Exception as e:
         traceback.print_exc()
-        return None, f"Ошибка: {e}", [], state
-# =========================
-# Шаг 2 — реакция
-# =========================
 def answer_selected(selected_option: str, state):
     if not state:
-        return None, "Ошибка: урок не запущен"
-    correct = state["correct"]
-    image_path = state["image_path"]
-    if selected_option == correct:
-        reply_ru = "Молодец!"
-        reply_ui = "Дұрыс!"
-    else:
-        reply_ru = f"Неправильно. Правильный ответ: {correct}"
-        reply_ui = f"Қате. Дұрыс жауап: {correct}"
-    audio_path = synthesize_audio(reply_ru)
-    video_path = make_talking_head(image_path, audio_path)
-    return video_path, reply_ui
 # =========================
-# Интерфейс
 # =========================
 with gr.Blocks() as demo:
-    gr.Markdown("# 🎓 Интерактивный бейне-лектор")
     with gr.Row():
-        with gr.Column():
-            inp_image = gr.Image(type="pil", label="Фото лектора")
-            inp_text = gr.Textbox(lines=4, label="Текст лекции (рус.)")
             btn_start = gr.Button("Запустить урок")
-        with gr.Column():
-            out_video = gr.Video(label="Видео лектора")
-            out_question = gr.Markdown(label="Вопрос")
             btn_opt1 = gr.Button("Вариант 1")
             btn_opt2 = gr.Button("Вариант 2")
-            out_react = gr.Video(label="Реакция")
-            out_status = gr.Textbox(label="Статус")
-    state = gr.State({})
     btn_start.click(
-        start_lesson,
-        [inp_image, inp_text, state],
-        [out_video, out_question, btn_opt1, btn_opt2, state]
     )
-    btn_opt1.click(answer_selected, [btn_opt1, state], [out_react, out_status])
-    btn_opt2.click(answer_selected, [btn_opt2, state], [out_react, out_status])
     demo.load(lambda: "Готово", outputs=out_status)
-if __name__ == "__main__":
     demo.launch()

 import scipy.io.wavfile as wavfile
 import traceback
 import random
 # =========================
 # Параметры
 # =========================
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 # =========================
 # Загрузка моделей
 # =========================
 try:
+    # TTS модель (казахский)
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
     tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
         device=0 if device == "cuda" else -1
     )
+    # Модель для генерации вопросов
     qa_model = pipeline(
         "text2text-generation",
+        model="google/flan-t5-small",
         device=0 if device == "cuda" else -1
     )
+    print("✅ Все модели успешно загружены!")
 except Exception as e:
+    raise RuntimeError(f"❌ Ошибка загрузки моделей: {str(e)}")
 # =========================
+# Вспомогательные функции
 # =========================
 def generate_quiz(text: str):
+    """Генерирует один вопрос и два варианта (correct, wrong) на русском языке."""
     prompt = (
+        "Сгенерируй один учебный вопрос по этому тексту и дай 1 правильный и 1 неправильный вариант ответа. "
+        "Формат вывода JSON: {\"question\": \"...\", \"correct\": \"...\", \"wrong\": \"...\"}. Текст: " + text
     )
     try:
+        out = qa_model(prompt, max_length=256)[0]["generated_text"]
+        # Пытаемся найти JSON в выводе модели
+        json_start = out.find("{")
+        json_end = out.rfind("}")
+        if json_start == -1 or json_end == -1:
             raise ValueError(f"Модель вывела неподходящий формат:\n{out}")
+        import json
+        data = json.loads(out[json_start: json_end+1])
+        question = data.get("question", "").strip()
+        correct = data.get("correct", "").strip()
+        wrong = data.get("wrong", "").strip()
+        if not (question and correct and wrong):
+            raise ValueError(f"Неполные данные:\n{out}")
+        options = [correct, wrong]
+        random.shuffle(options)
+        return question, options, correct
+    except Exception as e:
+        raise ValueError(f"Ошибка генерации вопроса:\n{str(e)}")
 def synthesize_audio(text_ru: str):
+    """Переводит русскую строку на казахский, синтезирует аудио и возвращает путь к файлу .wav"""
     translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
     text_kk = translation[0]["translation_text"]
         output = tts_model(**inputs)
     waveform = output.waveform.squeeze().cpu().numpy()
+    audio = (waveform * 32767).astype('int16')
+    sampling_rate = getattr(tts_model.config, 'sampling_rate', 22050)
+    tmpf = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+    wavfile.write(tmpf.name, sampling_rate, audio)
+    tmpf.close()
+    return tmpf.name
 def make_talking_head(image_path: str, audio_path: str):
+    """Вызывает SkyReels/Talking Head space и возвращает путь или URL видео."""
     client = Client(TALKING_HEAD_SPACE)
     result = client.predict(
         image_path=handle_file(image_path),
         steps=10,
         api_name="/process_image_audio"
     )
+    if isinstance(result, dict) and "video" in result:
+        return result["video"]
+    elif isinstance(result, str):
+        return result
+    else:
+        raise ValueError(f"Unexpected talking head result: {type(result)}")
 # =========================
+# Основные обработчики для Gradio
 # =========================
 def start_lesson(image: Image.Image, text: str, state):
+    """Шаг 1: генерируем видео-лекцию с вопросом и вариантами ответа."""
+    if image is None or not text.strip() or len(text) > 500:
+        return None, "", [], [], state
     try:
         question, options, correct = generate_quiz(text)
+        quiz_ru = f"Вопрос: {question} Варианты: 1) {options[0]} 2) {options[1]}"
+        audio_path = synthesize_audio(quiz_ru)
+        tmpimg = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
         image.save(tmpimg.name)
         tmpimg.close()
+        image_path = tmpimg.name
+        video_path = make_talking_head(image_path, audio_path)
+        state_data = {'image_path': image_path, 'correct': correct, 'options': options}
+        # удаляем временный аудио файл
+        try: os.remove(audio_path)
+        except: pass
+        return video_path, question, options, state_data, state_data
     except Exception as e:
         traceback.print_exc()
+        return None, f"Ошибка: {e}", [], [], state
 def answer_selected(selected_option: str, state):
+    """Шаг 2: пользователь выбирает вариант — генерируем реакцию лектора."""
     if not state:
+        return None, "Ошибка: отсутствует состояние урока. Сначала нажмите 'Запустить урок'."
+    try:
+        correct = state.get('correct')
+        image_path = state.get('image_path')
+        options = state.get('options', [])
+        if selected_option == correct:
+            reaction_ru = "Молодец!"
+            display_message = "Дұрыс!"
+        else:
+            reaction_ru = f"Неправильно. Правильный ответ: {correct}"
+            display_message = f"Қате. Дұрыс жауап: {correct}"
+        audio_path = synthesize_audio(reaction_ru)
+        reaction_video = make_talking_head(image_path, audio_path)
+        try: os.remove(audio_path)
+        except: pass
+        return reaction_video, display_message
+    except Exception as e:
+        traceback.print_exc()
+        return None, f"Ошибка: {e}"
 # =========================
+# Gradio UI
 # =========================
+title = "🎓 Интерактивный бейне-лектор"
+description = (
+    "Загрузите фото лектора и текст лекции (русский, до 500 символов). "
+    "Система создаст видео-лектора, задаст вопрос и предложит 2 варианта ответа. "
+    "Нажмите на один из вариантов — лектор коротко отреагирует (қазақша)."
+)
 with gr.Blocks() as demo:
+    gr.Markdown(f"# {title}\n{description}")
     with gr.Row():
+        with gr.Column(scale=1):
+            inp_image = gr.Image(type='pil', label='📸 Фото лектора')
+            inp_text = gr.Textbox(lines=5, label='📝 Текст лекции (рус.)', placeholder='Введите текст...')
             btn_start = gr.Button("Запустить урок")
+        with gr.Column(scale=1):
+            out_video = gr.Video(label='🎬 Видео лектора')
+            out_question = gr.Markdown(label='Вопрос')
             btn_opt1 = gr.Button("Вариант 1")
             btn_opt2 = gr.Button("Вариант 2")
+            out_reaction_video = gr.Video(label='🎥 Реакция лектора')
+            out_status = gr.Textbox(label='ℹ️ Статус', interactive=False)
+    lesson_state = gr.State({})
+    # Привязки
     btn_start.click(
+        fn=start_lesson,
+        inputs=[inp_image, inp_text, lesson_state],
+        outputs=[out_video, out_question, btn_opt1, btn_opt2, lesson_state]
     )
+    btn_opt1.click(fn=answer_selected, inputs=[btn_opt1, lesson_state], outputs=[out_reaction_video, out_status])
+    btn_opt2.click(fn=answer_selected, inputs=[btn_opt2, lesson_state], outputs=[out_reaction_video, out_status])
     demo.load(lambda: "Готово", outputs=out_status)
+if __name__ == '__main__':
     demo.launch()