Spaces:

AlserFurma
/

Yapi

Build error

App Files Community

AlserFurma committed on Dec 1, 2025

Commit

b9b0a3a

verified ·

1 Parent(s): 8e44e35

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -338

app.py CHANGED Viewed

@@ -6,386 +6,168 @@ from gradio_client import Client, handle_file
 import torch
 from transformers import VitsModel, AutoTokenizer
 import scipy.io.wavfile as wavfile
-import traceback
-import base64
-import random
-import numpy as np  # Фикс для модели (np.prod)
-# Принудительно CPU и минимальное использование памяти
-os.environ['CUDA_VISIBLE_DEVICES'] = ''
-torch.set_num_threads(2) # Ограничение потоков CPU
-device = "cpu"
-print(f"Using device: {device} (optimized mode)")
-# Глобальные переменные
-tts_model = None
-tts_tokenizer = None
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
-def load_tts_model():
-    """Загрузка только TTS модели"""
-    global tts_model, tts_tokenizer
-    if tts_model is None:
-        print("Загрузка TTS модели (казахский)...")
-        tts_model = VitsModel.from_pretrained(
-            "facebook/mms-tts-kaz",
-            torch_dtype=torch.float32,
-            low_cpu_mem_usage=True
-        )
-        tts_model.eval() # Режим инференса
-        tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
-        print("✓ TTS модель загружена")
-    return True
-def simple_translate_to_kazakh(russian_text):
-    """
-    Упрощенная транслитерация/перевод без тяжелых моделей
-    Для реального использования нужна легкая модель или API
-    """
-    # Простая замена для базовых слов (демо)
-    translations = {
-        'привет': 'сәлем',
-        'здравствуйте': 'сәлеметсіздер ме',
-        'спасибо': 'рахмет',
-        'пожалуйста': 'өтінемін',
-        'да': 'иә',
-        'нет': 'жоқ',
-        'сегодня': 'бүгін',
-        'завтра': 'ертең',
-        'математика': 'математика',
-        'физика': 'физика',
-        'урок': 'сабақ',
-        'лекция': 'дәріс',
-        'студент': 'студент',
-        'учитель': 'мұғалім',
-        'школа': 'мектеп',
-        'университет': 'университет',
-        'знание': 'білім',
-        'книга': 'кітап',
-        'вопрос': 'сұрақ',
-        'ответ': 'жауап'
-    }
-    text_lower = russian_text.lower()
-    result = russian_text
-    for ru, kk in translations.items():
-        result = result.replace(ru, kk)
-        result = result.replace(ru.capitalize(), kk.capitalize())
-    return result
 def inference(image: Image.Image, text: str):
     error_msg = ""
     video_path = None
     audio_path = None
     img_path = None
     try:
-        # Загрузка TTS
-        if not load_tts_model():
-            raise RuntimeError("Не удалось загрузить TTS модель")
-        # Валидация
         if image is None:
             raise ValueError("Загрузите изображение лектора!")
         if not text or not text.strip():
             raise ValueError("Введите текст лекции!")
         if len(text) > 500:
-            raise ValueError("Текст слишком длинный! Максимум 500 символов.")
-        print(f"Входной текст: '{text[:50]}...'")
-        # Простой перевод на казахский
-        translated_text = simple_translate_to_kazakh(text)
-        print(f"Переведенный текст: '{translated_text[:50]}...'")
-        # Генерация аудио с оптимизацией памяти
-        print("Генерация аудио...")
         with torch.no_grad():
-            inputs = tts_tokenizer(translated_text, return_tensors="pt", truncation=True, max_length=512)
-            # Освобождение памяти перед генерацией
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
             output = tts_model(**inputs)
             waveform = output.waveform.squeeze().cpu().numpy()
-            # Очистка
-            del inputs, output
         if waveform.size == 0:
-            raise ValueError("TTS сгенерировал пустое аудио!")
-        # Сохранение аудио
         audio = (waveform * 32767).astype("int16")
         sampling_rate = tts_model.config.sampling_rate
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
             wavfile.write(audio_file.name, sampling_rate, audio)
             audio_path = audio_file.name
-        print(f"✓ Аудио: {audio_path} ({len(waveform)/sampling_rate:.1f} сек)")
-        # Оптимизация изображения
-        print("Обработка изображения...")
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-        # Уменьшаем размер если слишком большое (экономия памяти)
-        max_size = 1024
-        if max(image.size) > max_size:
-            ratio = max_size / max(image.size)
-            new_size = tuple(int(dim * ratio) for dim in image.size)
-            image = image.resize(new_size, Image.Resampling.LANCZOS)
-            print(f"Изображение уменьшено до {new_size}")
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as img_file:
-            image.save(img_file.name, format='PNG', optimize=True)
             img_path = img_file.name
-        print(f"✓ Изображение: {img_path}")
-        # Вызов Talking Head API
         print(f"Подключение к {TALKING_HEAD_SPACE}...")
-        client = Client(TALKING_HEAD_SPACE, verbose=False)
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
-            guidance_scale=2.5, # Снижено для скорости
-            steps=8, # Меньше шагов = быстрее
             api_name="/process_image_audio"
         )
         # Обработка результата
         if isinstance(result, tuple) and len(result) > 0:
             video_data = result[0]
-            if isinstance(video_data, dict):
-                video_path = video_data.get('video') or video_data.get('path')
             elif isinstance(video_data, str):
                 video_path = video_data
             else:
-                video_path = str(video_data)
-        elif isinstance(result, str):
-            video_path = result
         else:
-            raise ValueError("Неизвестный формат результата от API")
-        if not video_path or not os.path.exists(video_path):
-            raise ValueError("Видео не сгенерировано!")
-        print(f"✓ Видео: {video_path}")
-        error_msg = "✅ Бейне сәтті жасалды!"
     except Exception as e:
-        error_msg = f"❌ Қате: {str(e)}"
         print(f"ОШИБКА: {error_msg}")
         traceback.print_exc()
     finally:
         # Очистка временных файлов
-        for path in [audio_path, img_path]:
-            if path and os.path.exists(path):
-                try:
-                    os.remove(path)
-                except:
-                    pass
     return video_path, error_msg
-def generate_interactive_lesson(text, video_path):
-    """Упрощенная версия без тяжелых моделей QA"""
-    try:
-        if not video_path or not os.path.exists(video_path):
-            return "<p style='color: red;'>❌ Алдымен бейнені жасаңыз!</p>"
-        # Простая генерация вопросов без ML моделей
-        sentences = text.split('.')[:3] # Первые 3 предложения
-        questions = []
-        for i, sent in enumerate(sentences):
-            sent = sent.strip()
-            if len(sent) < 10:
-                continue
-            # Простые шаблоны вопросов
-            words = sent.split()
-            if len(words) < 3:
-                continue
-            # Генерируем вопрос на основе шаблона
-            question_templates = [
-                f"Не сказано о {words[0].lower()}?",
-                f"Что упоминается в тексте о {words[1].lower() if len(words) > 1 else 'теме'}?",
-                f"Какая информация дана о {words[2].lower() if len(words) > 2 else 'содержании'}?"
-            ]
-            question = random.choice(question_templates)
-            # Правильный ответ - часть предложения
-            correct = ' '.join(words[:min(5, len(words))])
-            # Неправильные ответы
-            wrong_options = [
-                "Бұл туралы айтылмаған",
-                "Мәтінде жоқ",
-                "Дұрыс емес жауап"
-            ]
-            wrong = random.choice(wrong_options)
-            questions.append({
-                "question": question,
-                "correct": correct,
-                "wrong": wrong
-            })
-        if not questions:
-            # Создаем хотя бы один вопрос
-            questions.append({
-                "question": "Дәрістің негізгі тақырыбы не?",
-                "correct": text.split('.')[0][:50] if text else "Білім",
-                "wrong": "Спорт туралы"
-            })
-        # Base64 видео (оптимизировано)
-        print("Кодирование видео в base64...")
-        with open(video_path, 'rb') as f:
-            video_data = f.read()
-            # Проверка размера
-            if len(video_data) > 50 * 1024 * 1024: # 50MB
-                return "<p style='color: orange;'>⚠️ Видео слишком большое для встраивания. Скачайте его отдельно.</p>"
-            video_base64 = base64.b64encode(video_data).decode('utf-8')
-        # Минимальный HTML
-        html = f"""<!DOCTYPE html>
-<html>
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Интерактивті сабақ</title>
-    <style>
-        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
-        body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 15px; background: #f5f5f5; }}
-        h1 {{ color: #333; text-align: center; margin: 20px 0; font-size: 24px; }}
-        video {{ width: 100%; max-width: 600px; display: block; margin: 20px auto; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
-        .text {{ background: white; padding: 15px; margin: 20px 0; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
-        .q {{ background: white; padding: 15px; margin: 15px 0; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
-        button {{ background: #4CAF50; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; margin-top: 10px; }}
-        button:hover {{ background: #45a049; }}
-        .fb {{ margin-top: 10px; padding: 8px; border-radius: 5px; font-weight: bold; }}
-        label {{ cursor: pointer; }}
-    </style>
-</head>
-<body>
-    <h1>📚 Интерактивті сабақ</h1>
-    <video controls><source src="data:video/mp4;base64,{video_base64}" type="video/mp4"></video>
-    <div class="text"><strong>Дәріс мәтіні:</strong> {text[:500]}</div>
-    <h2 style="text-align:center; margin: 20px 0;">Тесттер:</h2>
-"""
-        for i, q in enumerate(questions):
-            ca = q['correct'].replace("'", "\\'").replace('"', '&quot;')
-            html += f"""
-    <div class="q">
-        <p><strong>Сұрақ {i+1}:</strong> {q['question']}</p>
-        <div style="margin: 10px 0;">
-            <input type="radio" name="q{i}" value="c" id="c{i}">
-            <label for="c{i}">{q['correct']}</label><br>
-            <input type="radio" name="q{i}" value="w" id="w{i}">
-            <label for="w{i}">{q['wrong']}</label>
-        </div>
-        <button onclick="check({i},'{ca}')">Тексеру</button>
-        <div class="fb" id="fb{i}"></div>
-    </div>
-"""
-        html += """
-    <script>
-    function check(i, c) {
-        var s = document.query_selector('input[name="q'+i+'"]:checked');
-        var f = document.getElementById('fb'+i);
-        if(!s) { f.innerHTML='⚠️ Жауап таңдаңыз!'; f.style.background='#fff3cd'; f.style.color='#856404'; return; }
-        if(s.value==='c') { f.innerHTML='✅ Дұрыс!'; f.style.background='#d4edda'; f.style.color='#155724'; }
-        else { f.innerHTML='❌ Қате. Дұрыс: '+c; f.style.background='#f8d7da'; f.style.color='#721c24'; }
-    }
-    </script>
-</body>
-</html>"""
-        escaped = html.replace('\\', '\\\\').replace('`', '\\`').replace('${', '\\${')
-        return f"""
-<div style="text-align:center; padding: 20px; background: white; border-radius: 8px;">
-    <h3 style="color: #2c3e50;">✅ Интерактивті сабақ дайын!</h3>
-    <button onclick="var w=window.open('','_blank');w.document.write(`{escaped}`);w.document.close();"
-            style="background: #27ae60; color: white; padding: 15px 30px; font-size: 16px; border: none;
-            border-radius: 8px; cursor: pointer; margin-top: 15px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
-        📖 Интерактивті сабақты ашу
-    </button>
-</div>
 """
-    except Exception as e:
-        traceback.print_exc()
-        return f"<p style='color: red;'>❌ Қате: {str(e)}</p>"
-# Интерфейс
-with gr.Blocks(theme=gr.themes.Soft(), title="Бейне Оқытушы", css="""
-    .gradio-container {max-width: 1200px !important;}
-    footer {display: none !important;}
-""") as iface:
-    gr.Markdown("""
-    # 🎓 Бейне Оқытушы (CPU Оптимизацияланған)
-    **Қалай пайдалану:**
-    1. 📸 Суретіңізді жүктеңіз (бет анық көрінетін)
-    2. 📝 Дәріс мәтінін орыс тілінде енгізіңіз (500 таңбаға дейін)
-    3. 🎬 "Бейнені жасау" батырмасын басыңыз
-    4. 📚 Дайын болғаннан кейін "Интерактивті сабақ" жасай аласыз
-    ⚡ **Ескерту:** CPU режимінде жұмыс істейді, генерация 1-3 минут алуы мүмкін.
-    """)
-    with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="📸 Дәріскер суреті")
-            text_input = gr.Textbox(
-                lines=6,
-                placeholder="Мысалы: Сәлеметсіздер ме! Бүгін біз математика туралы сөйлесеміз...",
-                label="📝 Дәріс мәтіні (орыс тілінде)"
-            )
-            generate_btn = gr.Button("🎬 Бейнені жасау", variant="primary", size="lg")
-        with gr.Column(scale=1):
-            video_output = gr.Video(label="🎬 Дайын бейне")
-            status = gr.Textbox(label="ℹ️ Мәртебе", interactive=False)
-    interactive_btn = gr.Button("📚 Интерактивті сабақ жасау", visible=False, variant="secondary")
-    lesson_output = gr.HTML(value="", label="Интерактивті сабақ", visible=False)
-    def show_lesson_btn(video, status_msg):
-        return gr.update(visible=bool(video and "✅" in status_msg))
-    generate_btn.click(
-        inference,
-        inputs=[image_input, text_input],
-        outputs=[video_output, status]
-    ).then(
-        show_lesson_btn,
-        inputs=[video_output, status],
-        outputs=interactive_btn
-    )
-    interactive_btn.click(
-        generate_interactive_lesson,
-        inputs=[text_input, video_output],
-        outputs=lesson_output
-    ).then(
-        lambda: gr.update(visible=True),
-        outputs=lesson_output
-    )
 if __name__ == "__main__":
-    iface.launch(
-        server_name="0.0.0.0",
-        server_port=7860
-    )

 import torch
 from transformers import VitsModel, AutoTokenizer
 import scipy.io.wavfile as wavfile
+# Загрузка обновленной TTS модели при старте
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+try:
+    tts_model = VitsModel.from_pretrained("facebook/mms-tts-rus").to(device)
+    tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
+    print("TTS модель загружена успешно!")
+except Exception as e:
+    raise RuntimeError(f"Ошибка загрузки TTS модели: {str(e)}")
+# Пространство для talking-head
 TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 def inference(image: Image.Image, text: str):
     error_msg = ""
     video_path = None
     audio_path = None
     img_path = None
     try:
+        # Валидация входных данных
         if image is None:
             raise ValueError("Загрузите изображение лектора!")
         if not text or not text.strip():
             raise ValueError("Введите текст лекции!")
         if len(text) > 500:
+            raise ValueError("Текст слишком длинный! Используйте до 500 символов.")
+        print(f"Генерация TTS для текста: '{text[:50]}...'")
+        # Шаг 1: Генерация аудио через TTS
+        torch.manual_seed(42)
+        inputs = tts_tokenizer(text, return_tensors="pt").to(device)
         with torch.no_grad():
             output = tts_model(**inputs)
             waveform = output.waveform.squeeze().cpu().numpy()
         if waveform.size == 0:
+            raise ValueError("TTS сгенерировал пустое аудио! Попробуйте другой текст.")
+        # Конвертация в int16 для WAV
         audio = (waveform * 32767).astype("int16")
         sampling_rate = tts_model.config.sampling_rate
+        # Сохранение аудио
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
             wavfile.write(audio_file.name, sampling_rate, audio)
             audio_path = audio_file.name
+        print(f"TTS аудио сохранено: {audio_path} (длина: {len(waveform)/sampling_rate:.1f} сек)")
+        # Шаг 2: Сохранение изображения
         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as img_file:
+            # Конвертация в RGB если нужно
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+            image.save(img_file.name, format='PNG')
             img_path = img_file.name
+        print(f"Изображение сохранено: {img_path}")
+        # Шаг 3: Вызов talking-head API
         print(f"Подключение к {TALKING_HEAD_SPACE}...")
+        client = Client(TALKING_HEAD_SPACE)
+        # Проверяем доступные API endpoints
+        print("Доступные API методы:", client.view_api())
+        # Вызов API с правильными параметрами
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
+            guidance_scale=3.0,
+            steps=10,
             api_name="/process_image_audio"
         )
+        print(f"Результат API: {type(result)}")
         # Обработка результата
         if isinstance(result, tuple) and len(result) > 0:
             video_data = result[0]
+            if isinstance(video_data, dict) and 'video' in video_data:
+                video_path = video_data['video']
+            elif isinstance(video_data, dict) and 'path' in video_data:
+                video_path = video_data['path']
             elif isinstance(video_data, str):
                 video_path = video_data
             else:
+                video_path = video_data
         else:
+            video_path = result
+        print(f"Видео сгенерировано: {video_path}")
+        error_msg = "✅ Видео успешно сгенерировано!"
     except Exception as e:
+        error_msg = f"❌ Ошибка: {str(e)}"
         print(f"ОШИБКА: {error_msg}")
+        import traceback
         traceback.print_exc()
     finally:
         # Очистка временных файлов
+        if audio_path and os.path.exists(audio_path):
+            try:
+                os.remove(audio_path)
+                print(f"Удален временный файл: {audio_path}")
+            except:
+                pass
+        if img_path and os.path.exists(img_path):
+            try:
+                os.remove(img_path)
+                print(f"Удален временный файл: {img_path}")
+            except:
+                pass
     return video_path, error_msg
+# Интерфейс Gradio
+title = "Видео-лектор с TTS (Русский)"
+description = """
+Загрузите фото лектора и введите текст лекции.
+Система сгенерирует видео, где лектор "произносит" ваш текст!
+**Требования:**
+- Фото: фронтальное изображение лица
+- Текст: до 500 символов на русском языке
 """
+examples = [
+    [
+        "example_image.png",
+        "Добрый день! Сегодня мы рассмотрим основы машинного обучения."
+    ]
+]
+iface = gr.Interface(
+    fn=inference,
+    inputs=[
+        gr.Image(type="pil", label="📸 Фото лектора"),
+        gr.Textbox(
+            lines=5,
+            placeholder="Введите текст лекции на русском языке (до 500 символов)...",
+            label="📝 Текст лекции"
+        )
+    ],
+    outputs=[
+        gr.Video(label="🎬 Готовое видео"),
+        gr.Textbox(label="ℹ️ Статус", interactive=False)
+    ],
+    title=title,
+    description=description,
+    flagging_mode="never",
+    examples=None,  # Добавьте примеры, если есть тестовые изображения
+    cache_examples=False
+)
 if __name__ == "__main__":
+    iface.launch()