Spaces:

AlserFurma
/

Yapi

Build error

App Files Files Community

AlserFurma committed on Dec 1, 2025

Commit

8e44e35

verified ·

1 Parent(s): 4434e4a

Update app.py

Browse files

Files changed (1) hide show

app.py +370 -73

app.py CHANGED Viewed

@@ -2,93 +2,390 @@ import gradio as gr
 import os
 from PIL import Image
 import tempfile
 import torch
 from transformers import VitsModel, AutoTokenizer
 import scipy.io.wavfile as wavfile
-from gradio_client import Client, handle_file
 import traceback
-# Только CPU
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
-torch.set_num_threads(4)
-TALKING_HEAD = "Skywork/skyreels-a1-talking-head"
-model = None
-tokenizer = None
-def load_tts():
-    global model, tokenizer
-    if model is None:
-        print("Загружаем TTS (каз)…")
-        model = VitsModel.from_pretrained("facebook/mms-tts-kaz")
-        tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
-        print("TTS готова")
     return True
-def ru_to_kz_simple(text: str) -> str:
-    rep = {
-        "привет": "сәлем", "здравствуйте": "сәлеметсіз бе", "спасибо": "рахмет",
-        "да": "иә", "нет": "жоқ", "сегодня": "бүгін", "завтра": "ертең",
-        "урок": "сабақ", "лекция": "дәріс", "учитель": "мұғалім", "школа": "мектеп"
     }
-    for ru, kz in rep.items():
-        text = text.replace(ru, kz).replace(ru.capitalize(), kz.capitalize())
-    return text
-def create_video(image: Image.Image, text: str):
-    if not image or not text.strip():
-        return None, "Загрузите фото и введите текст!"
-    load_tts()
-    text_kz = ru_to_kz_simple(text.strip())
     try:
-        # TTS
-        inputs = tokenizer(text_kz, return_tensors="pt")
         with torch.no_grad():
-            waveform = model(**inputs).waveform.squeeze().cpu().numpy()
-        rate = model.config.sampling_rate
-        audio_path = "/tmp/audio.wav"
-        wavfile.write(audio_path, rate, (waveform * 32767).astype("int16"))
-        # Изображение
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        img_path = "/tmp/img.png"
-        image.save(img_path)
-        # Talking head
-        client = Client(TALKING_HEAD)
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
-            guidance_scale=2.0,
-            steps=8,
             api_name="/process_image_audio"
         )
-        video_path = result[0] if isinstance(result, (list, tuple)) else result
-        return video_path, "Бейне дайын!"
     except Exception as e:
         traceback.print_exc()
-        return None, f"Қате: {e}"
-# === Интерфейс ===
-with gr.Blocks(title="Бейне-лектор қазақша") as app:
-    gr.Markdown("# Бейне-лектор қазақша\nФото + текст → говорящий видео-лектор")
     with gr.Row():
-        with gr.Column():
-            img_in = gr.Image(label="Фото лектора", type="pil")
-            txt_in = gr.Textbox(label="Текст лекции (русский)", lines=6, placeholder="Привет! Сегодня мы изучаем математику…")
-            btn = gr.Button("Сделать видео", variant="primary")
-        with gr.Column():
-            video_out = gr.Video(label="Готовое видео")
-            status = gr.Textbox(label="Статус", interactive=False)
-    btn.click(create_video, [img_in, txt_in], [video_out, status])
-app.launch(server_name="0.0.0.0", server_port=7860)

 import os
 from PIL import Image
 import tempfile
+from gradio_client import Client, handle_file
 import torch
 from transformers import VitsModel, AutoTokenizer
 import scipy.io.wavfile as wavfile
 import traceback
+import base64
+import random
+import numpy as np  # Фикс для модели (np.prod)
+# Force CPU execution and keep memory usage minimal: hide all CUDA devices
+# from this process.
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+torch.set_num_threads(2) # Cap the number of CPU threads torch may use
+device = "cpu"
+print(f"Using device: {device} (optimized mode)")
+# Module-level state: the TTS model and tokenizer start as None and are
+# filled in on first use by load_tts_model().
+tts_model = None
+tts_tokenizer = None
+# Identifier passed to gradio_client.Client in inference() for the remote
+# talking-head service.
+TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
+def load_tts_model():
+    """Load the Kazakh TTS model and tokenizer once and cache them globally.
+
+    Both objects are loaded into locals first and published to the module
+    globals together. If loading the tokenizer fails, no half-initialised
+    state (model cached, tokenizer still None) is left behind, and the next
+    call retries the whole load.
+
+    Returns:
+        True once both ``tts_model`` and ``tts_tokenizer`` are available.
+        Loading errors are not caught here; they propagate to the caller.
+    """
+    global tts_model, tts_tokenizer
+    if tts_model is None or tts_tokenizer is None:
+        print("Загрузка TTS модели (казахский)...")
+        model = VitsModel.from_pretrained(
+            "facebook/mms-tts-kaz",
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True
+        )
+        model.eval()  # inference mode
+        tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
+        # Publish both only after both loads succeeded
+        tts_model, tts_tokenizer = model, tokenizer
+        print("✓ TTS модель загружена")
     return True
+def simple_translate_to_kazakh(russian_text):
+    """Replace a small demo vocabulary of Russian words with Kazakh ones.
+
+    This is a dictionary substitution, not a real translation. Only whole
+    words are replaced, so short entries such as 'да' or 'нет' no longer
+    corrupt longer words that merely contain them ('когда', 'планета').
+    Matching is case-insensitive and the casing of the source word is
+    mirrored: lower -> lower, Capitalized -> Capitalized, UPPER -> UPPER.
+
+    Args:
+        russian_text: Input text; empty/None input is returned unchanged.
+
+    Returns:
+        The text with every known whole word substituted; everything else
+        (unknown words, punctuation, spacing) is left untouched.
+    """
+    import re  # local import: re is not part of this module's import block
+
+    # Demo vocabulary: lowercase Russian word -> lowercase Kazakh equivalent
+    translations = {
+        'привет': 'сәлем',
+        'здравствуйте': 'сәлеметсіздер ме',
+        'спасибо': 'рахмет',
+        'пожалуйста': 'өтінемін',
+        'да': 'иә',
+        'нет': 'жоқ',
+        'сегодня': 'бүгін',
+        'завтра': 'ертең',
+        'математика': 'математика',
+        'физика': 'физика',
+        'урок': 'сабақ',
+        'лекция': 'дәріс',
+        'студент': 'студент',
+        'учитель': 'мұғалім',
+        'школа': 'мектеп',
+        'университет': 'университет',
+        'знание': 'білім',
+        'книга': 'кітап',
+        'вопрос': 'сұрақ',
+        'ответ': 'жауап'
     }
+    if not russian_text:
+        return russian_text
+    # \b is Unicode-aware for str patterns, so it delimits Cyrillic words too
+    pattern = re.compile(
+        r'\b(?:' + '|'.join(re.escape(word) for word in translations) + r')\b',
+        re.IGNORECASE,
+    )
+
+    def _swap(match):
+        source = match.group(0)
+        target = translations[source.lower()]
+        if len(source) > 1 and source.isupper():
+            return target.upper()
+        if source[0].isupper():
+            return target.capitalize()
+        return target
+
+    return pattern.sub(_swap, russian_text)
+def inference(image: Image.Image, text: str):
+    """Turn a lecturer photo and lecture text into a talking-head video.
+
+    Steps: validate the inputs, make sure the TTS model is loaded, run the
+    text through simple_translate_to_kazakh(), synthesise speech into a
+    temporary WAV file, save the (possibly downscaled) photo as a temporary
+    PNG, and send both files to the ``/process_image_audio`` endpoint of
+    TALKING_HEAD_SPACE. Temporary files are removed afterwards.
+
+    Args:
+        image: PIL image of the lecturer; None is rejected.
+        text: Lecture text; must be non-blank and at most 500 characters.
+
+    Returns:
+        A ``(video_path, message)`` tuple. On success ``video_path`` is the
+        existing file path reported by the API and ``message`` starts with
+        "✅". On any failure ``video_path`` is None and ``message`` starts
+        with "❌". Exceptions are printed and reported, never raised.
+    """
+    error_msg = ""
+    video_path = None
+    audio_path = None
+    img_path = None
     try:
+        # Validate first so that bad input never triggers the model load
+        if image is None:
+            raise ValueError("Загрузите изображение лектора!")
+        if not text or not text.strip():
+            raise ValueError("Введите текст лекции!")
+        if len(text) > 500:
+            raise ValueError("Текст слишком длинный! Максимум 500 символов.")
+        if not load_tts_model():
+            raise RuntimeError("Не удалось загрузить TTS модель")
+        print(f"Входной текст: '{text[:50]}...'")
+        # Dictionary-based substitution into Kazakh
+        translated_text = simple_translate_to_kazakh(text)
+        print(f"Переведенный текст: '{translated_text[:50]}...'")
+        # Speech synthesis without autograd bookkeeping
+        print("Генерация аудио...")
         with torch.no_grad():
+            inputs = tts_tokenizer(translated_text, return_tensors="pt", truncation=True, max_length=512)
+            # Only has an effect when a CUDA device is visible to torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            output = tts_model(**inputs)
+            waveform = output.waveform.squeeze().cpu().numpy()
+            # Drop references that are no longer needed
+            del inputs, output
+        if waveform.size == 0:
+            raise ValueError("TTS сгенерировал пустое аудио!")
+        # Clip to [-1, 1] before scaling: samples outside that range are not
+        # representable in int16 and would be corrupted by the cast.
+        audio = (waveform.clip(-1.0, 1.0) * 32767).astype("int16")
+        sampling_rate = tts_model.config.sampling_rate
+        # Remember the temp path before writing so the finally-block removes
+        # the file even when the write fails; write after the handle is closed.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
+            audio_path = audio_file.name
+        wavfile.write(audio_path, sampling_rate, audio)
+        print(f"✓ Аудио: {audio_path} ({waveform.size/sampling_rate:.1f} сек)")
+        # Image preparation
+        print("Обработка изображения...")
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        # Cap the longest side at max_size to save memory
+        max_size = 1024
+        if max(image.size) > max_size:
+            ratio = max_size / max(image.size)
+            # max(1, ...) keeps extreme aspect ratios from rounding a side to 0
+            new_size = tuple(max(1, int(dim * ratio)) for dim in image.size)
+            image = image.resize(new_size, Image.Resampling.LANCZOS)
+            print(f"Изображение уменьшено до {new_size}")
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as img_file:
+            img_path = img_file.name
+        image.save(img_path, format='PNG', optimize=True)
+        print(f"✓ Изображение: {img_path}")
+        # Call the remote talking-head API
+        print(f"Подключение к {TALKING_HEAD_SPACE}...")
+        client = Client(TALKING_HEAD_SPACE, verbose=False)
         result = client.predict(
             image_path=handle_file(img_path),
             audio_path=handle_file(audio_path),
+            guidance_scale=2.5, # Author's note: reduced for speed
+            steps=8, # Author's note: fewer steps = faster
             api_name="/process_image_audio"
         )
+        # Normalise the result: either a list/tuple whose first item is the
+        # video, or the video item itself (dict with a path, or a bare path).
+        video_data = result[0] if isinstance(result, (list, tuple)) and result else result
+        if isinstance(video_data, dict):
+            video_path = video_data.get('video') or video_data.get('path')
+        elif isinstance(video_data, (str, os.PathLike)):
+            video_path = os.fspath(video_data)
+        else:
+            raise ValueError("Неизвестный формат результата от API")
+        if not video_path or not os.path.exists(video_path):
+            raise ValueError("Видео не сгенерировано!")
+        print(f"✓ Видео: {video_path}")
+        error_msg = "✅ Бейне сәтті жасалды!"
     except Exception as e:
+        # Never hand a stale or non-existent path to the UI on failure
+        video_path = None
+        error_msg = f"❌ Қате: {str(e)}"
+        print(f"ОШИБКА: {error_msg}")
         traceback.print_exc()
+    finally:
+        # Best-effort removal of the temporary audio/image files
+        for path in [audio_path, img_path]:
+            if path and os.path.exists(path):
+                try:
+                    os.remove(path)
+                except OSError as cleanup_error:
+                    print(f"Не удалось удалить временный файл {path}: {cleanup_error}")
+    return video_path, error_msg
+def generate_interactive_lesson(text, video_path):
+    """Build an HTML snippet whose button opens a self-contained quiz page.
+
+    The generated page embeds the video as a base64 data URI, shows the
+    first 500 characters of the lecture text and up to three template-based
+    questions (no ML models). All user-derived text is HTML-escaped, and the
+    page source is escaped once more before being placed inside the launcher
+    button's ``onclick`` attribute.
+
+    Args:
+        text: Lecture text; questions come from its first three
+            '.'-separated sentences. None is treated as empty.
+        video_path: Path of the generated video file on disk.
+
+    Returns:
+        An HTML string: the launcher button on success, or a short coloured
+        ``<p>`` message when the video is missing, larger than 50 MB, or an
+        unexpected error occurs (errors are printed, never raised).
+    """
+    from html import escape  # local import; ``html`` below is a local string
+
+    try:
+        if not video_path or not os.path.exists(video_path):
+            return "<p style='color: red;'>❌ Алдымен бейнені жасаңыз!</p>"
+        text = text or ""
+        # Distractor answers shared by all generated questions
+        wrong_options = [
+            "Бұл туралы айтылмаған",
+            "Мәтінде жоқ",
+            "Дұрыс емес жауап"
+        ]
+        # Template-based question generation from the first 3 sentences
+        questions = []
+        for sent in text.split('.')[:3]:
+            sent = sent.strip()
+            words = sent.split()
+            # Skip fragments too short to form a meaningful question
+            if len(sent) < 10 or len(words) < 3:
+                continue
+            question_templates = [
+                f"Не сказано о {words[0].lower()}?",
+                f"Что упоминается в тексте о {words[1].lower()}?",
+                f"Какая информация дана о {words[2].lower()}?"
+            ]
+            question = random.choice(question_templates)
+            # The correct answer is the beginning of the sentence
+            correct = ' '.join(words[:5])
+            wrong = random.choice(wrong_options)
+            questions.append({
+                "question": question,
+                "correct": correct,
+                "wrong": wrong
+            })
+        if not questions:
+            # Always provide at least one question
+            first_sentence = text.split('.')[0].strip()[:50]
+            questions.append({
+                "question": "Дәрістің негізгі тақырыбы не?",
+                "correct": first_sentence or "Білім",
+                "wrong": "Спорт туралы"
+            })
+        # Check the size on disk before loading the file into memory
+        if os.path.getsize(video_path) > 50 * 1024 * 1024: # 50MB
+            return "<p style='color: orange;'>⚠️ Видео слишком большое для встраивания. Скачайте его отдельно.</p>"
+        print("Кодирование видео в base64...")
+        with open(video_path, 'rb') as f:
+            video_base64 = base64.b64encode(f.read()).decode('utf-8')
+        safe_text = escape(text[:500])
+        # Minimal standalone HTML page
+        html = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Интерактивті сабақ</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 15px; background: #f5f5f5; }}
+        h1 {{ color: #333; text-align: center; margin: 20px 0; font-size: 24px; }}
+        video {{ width: 100%; max-width: 600px; display: block; margin: 20px auto; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
+        .text {{ background: white; padding: 15px; margin: 20px 0; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
+        .q {{ background: white; padding: 15px; margin: 15px 0; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
+        button {{ background: #4CAF50; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer; margin-top: 10px; }}
+        button:hover {{ background: #45a049; }}
+        .fb {{ margin-top: 10px; padding: 8px; border-radius: 5px; font-weight: bold; }}
+        label {{ cursor: pointer; }}
+    </style>
+</head>
+<body>
+    <h1>📚 Интерактивті сабақ</h1>
+    <video controls><source src="data:video/mp4;base64,{video_base64}" type="video/mp4"></video>
+    <div class="text"><strong>Дәріс мәтіні:</strong> {safe_text}</div>
+    <h2 style="text-align:center; margin: 20px 0;">Тесттер:</h2>
+"""
+        for i, q in enumerate(questions):
+            question_html = escape(q['question'])
+            # escape() also encodes quotes, so this is safe in the attribute too
+            correct_html = escape(q['correct'])
+            wrong_html = escape(q['wrong'])
+            # The correct answer travels in a data attribute instead of a
+            # hand-escaped JavaScript string literal.
+            html += f"""
+    <div class="q">
+        <p><strong>Сұрақ {i+1}:</strong> {question_html}</p>
+        <div style="margin: 10px 0;">
+            <input type="radio" name="q{i}" value="c" id="c{i}">
+            <label for="c{i}">{correct_html}</label><br>
+            <input type="radio" name="q{i}" value="w" id="w{i}">
+            <label for="w{i}">{wrong_html}</label>
+        </div>
+        <button data-correct="{correct_html}" onclick="check({i}, this.dataset.correct)">Тексеру</button>
+        <div class="fb" id="fb{i}"></div>
+    </div>
+"""
+        # textContent (not innerHTML) keeps the answer text from being parsed as markup
+        html += """
+    <script>
+    function check(i, c) {
+        var s = document.querySelector('input[name="q'+i+'"]:checked');
+        var f = document.getElementById('fb'+i);
+        if(!s) { f.textContent='⚠️ Жауап таңдаңыз!'; f.style.background='#fff3cd'; f.style.color='#856404'; return; }
+        if(s.value==='c') { f.textContent='✅ Дұрыс!'; f.style.background='#d4edda'; f.style.color='#155724'; }
+        else { f.textContent='❌ Қате. Дұрыс: '+c; f.style.background='#f8d7da'; f.style.color='#721c24'; }
+    }
+    </script>
+</body>
+</html>"""
+        # Step 1: make the page safe inside a JavaScript template literal.
+        js_literal = html.replace('\\', '\\\\').replace('`', '\\`').replace('${', '\\${')
+        # Step 2: make that JavaScript safe inside the double-quoted onclick
+        # attribute; the browser decodes the entities before running the code.
+        escaped = escape(js_literal, quote=True)
+        return f"""
+<div style="text-align:center; padding: 20px; background: white; border-radius: 8px;">
+    <h3 style="color: #2c3e50;">✅ Интерактивті сабақ дайын!</h3>
+    <button onclick="var w=window.open('','_blank');w.document.write(`{escaped}`);w.document.close();"
+            style="background: #27ae60; color: white; padding: 15px 30px; font-size: 16px; border: none;
+            border-radius: 8px; cursor: pointer; margin-top: 15px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
+        📖 Интерактивті сабақты ашу
+    </button>
+</div>
+"""
+    except Exception as e:
+        traceback.print_exc()
+        return f"<p style='color: red;'>❌ Қате: {escape(str(e))}</p>"
+# User interface: one Blocks app with a video-generation row and an
+# initially hidden "interactive lesson" section.
+with gr.Blocks(theme=gr.themes.Soft(), title="Бейне Оқытушы", css="""
+    .gradio-container {max-width: 1200px !important;}
+    footer {display: none !important;}
+""") as iface:
+    gr.Markdown("""
+    # 🎓 Бейне Оқытушы (CPU Оптимизацияланған)
+    **Қалай пайдалану:**
+    1. 📸 Суретіңізді жүктеңіз (бет анық көрінетін)
+    2. 📝 Дәріс мәтінін орыс тілінде енгізіңіз (500 таңбаға дейін)
+    3. 🎬 "Бейнені жасау" батырмасын басыңыз
+    4. 📚 Дайын болғаннан кейін "Интерактивті сабақ" жасай аласыз
+    ⚡ **Ескерту:** CPU режимінде жұмыс істейді, генерация 1-3 минут алуы мүмкін.
+    """)
     with gr.Row():
+        # Left column: inputs (photo, lecture text) and the generate button
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="📸 Дәріскер суреті")
+            text_input = gr.Textbox(
+                lines=6,
+                placeholder="Мысалы: Сәлеметсіздер ме! Бүгін біз математика туралы сөйлесеміз...",
+                label="📝 Дәріс мәтіні (орыс тілінде)"
+            )
+            generate_btn = gr.Button("🎬 Бейнені жасау", variant="primary", size="lg")
+        # Right column: outputs (generated video and a read-only status line)
+        with gr.Column(scale=1):
+            video_output = gr.Video(label="🎬 Дайын бейне")
+            status = gr.Textbox(label="ℹ️ Мәртебе", interactive=False)
+    # Both lesson widgets start hidden and are revealed by the handlers below
+    interactive_btn = gr.Button("📚 Интерактивті сабақ жасау", visible=False, variant="secondary")
+    lesson_output = gr.HTML(value="", label="Интерактивті сабақ", visible=False)
+    # Show the lesson button only when a video exists and the status message
+    # contains the "✅" success marker that inference() puts into it.
+    def show_lesson_btn(video, status_msg):
+        return gr.update(visible=bool(video and "✅" in status_msg))
+    # Step 1: run inference(); step 2: update the lesson button's visibility
+    generate_btn.click(
+        inference,
+        inputs=[image_input, text_input],
+        outputs=[video_output, status]
+    ).then(
+        show_lesson_btn,
+        inputs=[video_output, status],
+        outputs=interactive_btn
+    )
+    # Step 1: build the lesson HTML; step 2: make the HTML component visible
+    interactive_btn.click(
+        generate_interactive_lesson,
+        inputs=[text_input, video_output],
+        outputs=lesson_output
+    ).then(
+        lambda: gr.update(visible=True),
+        outputs=lesson_output
+    )
+if __name__ == "__main__":
+    # Serve on all interfaces, port 7860, when executed as a script
+    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
+    iface.launch(**launch_options)