AlserFurma committed on
Commit
a0c5931
·
verified ·
1 Parent(s): 3456931

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -131
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # Полная исправленная версия app.py
2
-
3
  import gradio as gr
4
  import os
5
  from PIL import Image
@@ -10,51 +8,48 @@ from transformers import VitsModel, AutoTokenizer, pipeline
10
  import scipy.io.wavfile as wavfile
11
  import traceback
12
  import random
 
13
 
14
  # =========================
15
- # Параметры
16
  # =========================
17
  TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
18
-
19
  device = "cuda" if torch.cuda.is_available() else "cpu"
20
- print(f"Using device: {device}")
21
 
22
  # =========================
23
- # Загрузка моделей
24
  # =========================
25
  try:
26
- # TTS модель (казахский)
27
  tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
28
  tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
29
 
30
- # Перевод ru -> kk
31
  translator = pipeline(
32
  "translation",
33
  model="facebook/nllb-200-distilled-600M",
34
  device=0 if device == "cuda" else -1
35
  )
36
 
37
- # Модель генерации вопросов
38
  qa_model = pipeline(
39
  "text2text-generation",
40
  model="google/flan-t5-small",
41
  device=0 if device == "cuda" else -1
42
  )
43
 
44
- print("✅ Все модели успешно загружены!")
45
 
46
  except Exception as e:
47
- raise RuntimeError(f"Ошибка загрузки моделей: {str(e)}")
48
 
49
 
50
  # =========================
51
- # Вспомогательные функции
52
  # =========================
53
 
54
  def generate_quiz(text: str):
55
  prompt = (
56
- "Сгенерируй один учебный вопрос по этому тексту и дай 1 правильный и 1 неправильный вариант.\n"
57
- "Строго используй такой формат (каждая часть с новой строки):\n"
58
  "QUESTION: ...\n"
59
  "CORRECT: ...\n"
60
  "WRONG: ...\n"
@@ -66,44 +61,41 @@ def generate_quiz(text: str):
66
  except Exception as e:
67
  raise RuntimeError(f"Ошибка генерации вопроса: {e}")
68
 
69
- # Унифицируем текст
70
- data = out.replace("\r", "")
71
-
72
- # --- Пытаемся достать через регулярные выражения ---
73
- q = re.search(r"QUESTION:\s*(.+)", data, re.IGNORECASE)
74
- c = re.search(r"CORRECT:\s*(.+)", data, re.IGNORECASE)
75
- w = re.search(r"WRONG:\s*(.+)", data, re.IGNORECASE)
76
 
77
  question = q.group(1).strip() if q else ""
78
  correct = c.group(1).strip() if c else ""
79
  wrong = w.group(1).strip() if w else ""
80
 
81
- # --- Если пусто — fallback парсер ---
82
  if not (question and correct and wrong):
83
- lines = [ln.strip() for ln in data.split('\n') if ln.strip()]
84
- for ln in lines:
85
- if ln.lower().startswith("question"):
86
- question = ln.split(":", 1)[1].strip()
87
- elif ln.lower().startswith("correct"):
88
- correct = ln.split(":", 1)[1].strip()
89
- elif ln.lower().startswith("wrong"):
90
- wrong = ln.split(":", 1)[1].strip()
91
-
92
- # --- Если всё ещё пусто — ошибка ---
93
  if not (question and correct and wrong):
94
- raise ValueError(
95
- f"Модель вывела неподходящий формат:\n---\n{out}\n---"
96
- )
97
 
98
- # Случайно перемешать варианты
99
  options = [correct, wrong]
100
  random.shuffle(options)
101
-
102
  return question, options, correct
103
 
104
 
 
 
 
 
105
  def synthesize_audio(text_ru: str):
106
- """Переводит русскую строку на казахский, синтезирует аудио и возвращает путь к файлу."""
107
  translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
108
  text_kk = translation[0]["translation_text"]
109
 
@@ -112,20 +104,22 @@ def synthesize_audio(text_ru: str):
112
  output = tts_model(**inputs)
113
 
114
  waveform = output.waveform.squeeze().cpu().numpy()
115
- if waveform.size == 0:
116
- raise ValueError("TTS вернул пустое аудио")
117
 
118
- audio = (waveform * 32767).astype('int16')
119
- sampling_rate = getattr(tts_model.config, 'sampling_rate', 22050)
 
 
120
 
121
- tmpf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
122
- wavfile.write(tmpf.name, sampling_rate, audio)
123
- tmpf.close()
124
- return tmpf.name
125
 
 
 
 
126
 
127
  def make_talking_head(image_path: str, audio_path: str):
128
  client = Client(TALKING_HEAD_SPACE)
 
129
  try:
130
  result = client.predict(
131
  image_path=handle_file(image_path),
@@ -135,144 +129,124 @@ def make_talking_head(image_path: str, audio_path: str):
135
  api_name="/process_image_audio"
136
  )
137
  except Exception as e:
138
- raise RuntimeError(f"Ошибка Talking Head API: {e}")
139
 
140
- video_path = None
 
141
 
142
- if isinstance(result, tuple) and len(result) > 0:
143
- video_data = result[0]
144
  else:
145
- video_data = result
146
-
147
- if isinstance(video_data, dict):
148
- video_path = video_data.get("video") or video_data.get("path") or video_data.get("file")
149
- elif isinstance(video_data, str):
150
- video_path = video_data
151
 
152
- if not video_path:
153
  raise ValueError("API не вернул путь к видео")
154
 
155
- return video_path
156
 
157
 
158
  # =========================
159
- # Логика Gradio
160
  # =========================
161
 
162
- def start_lesson(image: Image.Image, text: str, state):
163
- if image is None or not text or not text.strip() or len(text) > 500:
164
- return None, "", [], [], state
 
 
165
 
166
  try:
167
- # Генерируем вопрос
168
  question, options, correct = generate_quiz(text)
169
 
170
- quiz_ru = f"Вопрос: {question} Варианты: 1) {options[0]} 2) {options[1]}"
 
 
171
 
172
- audio_path = synthesize_audio(quiz_ru)
 
 
 
173
 
174
- # Сохраняем изображение
175
- tmpimg = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
176
- if image.mode != "RGB":
177
- image = image.convert("RGB")
178
- image.save(tmpimg.name)
179
- tmpimg.close()
180
- image_path = tmpimg.name
181
 
182
- video_path = make_talking_head(image_path, audio_path)
183
 
184
- # Стейт
185
- state_data = {
186
- "image_path": image_path,
187
  "correct": correct,
188
- "options": options
 
189
  }
190
 
191
- # Удаляем аудио
192
- if os.path.exists(audio_path):
193
- os.remove(audio_path)
194
-
195
- return video_path, question, options, state_data, state_data
196
 
197
  except Exception as e:
198
  traceback.print_exc()
199
  return None, f"Ошибка: {e}", [], [], state
200
 
201
 
202
- def answer_selected(selected_option: str, state):
203
- if not state:
204
- return None, "Ошибка: нет состояния. Нажмите 'Запустить урок'."
205
-
206
- try:
207
- correct = state.get("correct")
208
- image_path = state.get("image_path")
209
- options = state.get("options", [])
210
 
211
- if selected_option == correct:
212
- reaction_ru = "Молодец!"
213
- display_message = "Дұрыс!"
214
- else:
215
- reaction_ru = f"Неправильно. Правильный ответ: {correct}"
216
- display_message = f"Қате. Дұрыс жауап: {correct}"
217
 
218
- audio_path = synthesize_audio(reaction_ru)
219
- reaction_video = make_talking_head(image_path, audio_path)
220
 
221
- if os.path.exists(audio_path):
222
- os.remove(audio_path)
 
 
 
 
223
 
224
- return reaction_video, display_message
 
 
225
 
226
- except Exception as e:
227
- traceback.print_exc()
228
- return None, f"Ошибка: {e}"
229
 
230
 
231
  # =========================
232
- # Интерфейс Gradio
233
  # =========================
234
 
235
- title = "🎓 Интерактивный бейне-лектор"
236
 
237
  description = (
238
- "Загрузите фото лектора и текст лекции (рус., до 500 символов).<br>"
239
- "Система создаст видео-лектора, задаст вопрос и предложит 2 варианта ответа.<br>"
240
- "После выбора варианта лектор коротко ответит по-казахски."
241
  )
242
 
243
  with gr.Blocks() as demo:
244
  gr.Markdown(f"# {title}<br>{description}")
245
 
246
  with gr.Row():
247
- with gr.Column(scale=1):
248
- inp_image = gr.Image(type='pil', label="📸 Фото лектора")
249
- inp_text = gr.Textbox(lines=5, label="📝 Текст лекции (рус.)")
250
- btn_start = gr.Button("Запустить урок")
251
 
252
- with gr.Column(scale=1):
253
  out_video = gr.Video(label="🎬 Видео лектора")
254
  out_question = gr.Markdown(label="Вопрос")
 
 
 
 
255
 
256
- btn_opt1 = gr.Button("Вариант 1")
257
- btn_opt2 = gr.Button("Вариант 2")
258
 
259
- out_reaction_video = gr.Video(label="🎥 Реакция лектора")
260
- out_status = gr.Textbox(label="ℹ️ Статус", interactive=False)
261
-
262
- lesson_state = gr.State({})
263
-
264
- btn_start.click(
265
- fn=start_lesson,
266
- inputs=[inp_image, inp_text, lesson_state],
267
- outputs=[out_video, out_question, btn_opt1, btn_opt2, lesson_state]
268
- )
269
 
270
- btn_opt1.click(fn=answer_selected, inputs=[btn_opt1, lesson_state],
271
- outputs=[out_reaction_video, out_status])
272
- btn_opt2.click(fn=answer_selected, inputs=[btn_opt2, lesson_state],
273
- outputs=[out_reaction_video, out_status])
274
 
275
- demo.load(lambda: "Готово", outputs=out_status)
276
 
277
  if __name__ == "__main__":
278
  demo.launch()
 
 
 
1
  import gradio as gr
2
  import os
3
  from PIL import Image
 
8
  import scipy.io.wavfile as wavfile
9
  import traceback
10
  import random
11
+ import re
12
 
13
  # =========================
14
+ # ПАРАМЕТРЫ
15
  # =========================
16
  TALKING_HEAD_SPACE = "Skywork/skyreels-a1-talking-head"
 
17
  device = "cuda" if torch.cuda.is_available() else "cpu"
18
+ print("Device:", device)
19
 
20
  # =========================
21
+ # ЗАГРУЗКА МОДЕЛЕЙ
22
  # =========================
23
  try:
 
24
  tts_model = VitsModel.from_pretrained("facebook/mms-tts-kaz").to(device)
25
  tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kaz")
26
 
 
27
  translator = pipeline(
28
  "translation",
29
  model="facebook/nllb-200-distilled-600M",
30
  device=0 if device == "cuda" else -1
31
  )
32
 
 
33
  qa_model = pipeline(
34
  "text2text-generation",
35
  model="google/flan-t5-small",
36
  device=0 if device == "cuda" else -1
37
  )
38
 
39
+ print("Модели успешно загружены!")
40
 
41
  except Exception as e:
42
+ raise RuntimeError(f"Ошибка при загрузке моделей: {e}")
43
 
44
 
45
  # =========================
46
+ # ГЕНЕРАЦИЯ ВОПРОСА
47
  # =========================
48
 
49
  def generate_quiz(text: str):
50
  prompt = (
51
+ "Сгенерируй учебный вопрос по тексту и дай 1 правильный и 1 неправильный вариант ответа.\n"
52
+ "СТРОГО ИСПОЛЬЗУЙ ФОРМАТ:\n"
53
  "QUESTION: ...\n"
54
  "CORRECT: ...\n"
55
  "WRONG: ...\n"
 
61
  except Exception as e:
62
  raise RuntimeError(f"Ошибка генерации вопроса: {e}")
63
 
64
+ text_out = out.replace("\r", "").strip()
65
+
66
+ # --- Regular expressions ---
67
+ q = re.search(r"QUESTION:\s*(.+)", text_out, re.IGNORECASE)
68
+ c = re.search(r"CORRECT:\s*(.+)", text_out, re.IGNORECASE)
69
+ w = re.search(r"WRONG:\s*(.+)", text_out, re.IGNORECASE)
 
70
 
71
  question = q.group(1).strip() if q else ""
72
  correct = c.group(1).strip() if c else ""
73
  wrong = w.group(1).strip() if w else ""
74
 
75
+ # --- fallback ---
76
  if not (question and correct and wrong):
77
+ lines = [l.strip() for l in text_out.split("\n") if l.strip()]
78
+ for l in lines:
79
+ if l.lower().startswith("question"):
80
+ question = l.split(":", 1)[1].strip()
81
+ elif l.lower().startswith("correct"):
82
+ correct = l.split(":", 1)[1].strip()
83
+ elif l.lower().startswith("wrong"):
84
+ wrong = l.split(":", 1)[1].strip()
85
+
 
86
  if not (question and correct and wrong):
87
+ raise ValueError(f"Модель вывела неправильный формат:\n{out}")
 
 
88
 
 
89
  options = [correct, wrong]
90
  random.shuffle(options)
 
91
  return question, options, correct
92
 
93
 
94
+ # =========================
95
+ # АУДИО НА КАЗАХСКОМ
96
+ # =========================
97
+
98
  def synthesize_audio(text_ru: str):
 
99
  translation = translator(text_ru, src_lang="rus_Cyrl", tgt_lang="kaz_Cyrl")
100
  text_kk = translation[0]["translation_text"]
101
 
 
104
  output = tts_model(**inputs)
105
 
106
  waveform = output.waveform.squeeze().cpu().numpy()
107
+ audio = (waveform * 32767).astype("int16")
108
+ sr = getattr(tts_model.config, "sampling_rate", 22050)
109
 
110
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
111
+ wavfile.write(tmp.name, sr, audio)
112
+ tmp.close()
113
+ return tmp.name
114
 
 
 
 
 
115
 
116
+ # =========================
117
+ # TALKING HEAD
118
+ # =========================
119
 
120
  def make_talking_head(image_path: str, audio_path: str):
121
  client = Client(TALKING_HEAD_SPACE)
122
+
123
  try:
124
  result = client.predict(
125
  image_path=handle_file(image_path),
 
129
  api_name="/process_image_audio"
130
  )
131
  except Exception as e:
132
+ raise RuntimeError(f"Ошибка вызова Talking Head API: {e}")
133
 
134
+ if isinstance(result, tuple):
135
+ result = result[0]
136
 
137
+ if isinstance(result, dict):
138
+ video = result.get("video") or result.get("file") or result.get("path")
139
  else:
140
+ video = result
 
 
 
 
 
141
 
142
+ if not video:
143
  raise ValueError("API не вернул путь к видео")
144
 
145
+ return video
146
 
147
 
148
  # =========================
149
+ # GRADIO — ШАГ 1
150
  # =========================
151
 
152
def _option_labels(state):
    """Return the two answer-button captions to show when no new quiz is produced.

    If ``state`` still holds the options of a previous lesson they are reused,
    so the buttons stay consistent with the stored correct answer. Otherwise
    the captions the buttons are created with in the UI are returned.
    """
    options = state.get("options") if isinstance(state, dict) else None
    if options and len(options) >= 2:
        return options[0], options[1]
    return "Вариант 1", "Вариант 2"


def _remove_quietly(path):
    """Best-effort removal of a temporary file; a missing file is not an error."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        # Cleanup must never mask the real result or the real error.
        pass


def start_lesson(image, text, state):
    """Build the quiz video for a lecture and prepare the answer buttons.

    Args:
        image: Lecturer photo as a PIL image (``None`` when nothing was uploaded).
        text: Lecture text, at most 500 characters.
        state: Current lesson state dict; returned unchanged on any failure.

    Returns:
        A 5-tuple matching the click outputs
        ``[out_video, out_question, opt1, opt2, state]``:
        ``(video, question_or_message, option_1, option_2, new_state)``.
        The two options are plain strings because they become the button
        captions, and the caption is what ``answer_selected`` later receives
        as the selected answer.
    """
    if image is None:
        return (None, "Загрузите фото", *_option_labels(state), state)
    # Whitespace-only text carries no content to build a question from.
    if not text or not text.strip() or len(text) > 500:
        return (None, "Введите текст (до 500 символов)", *_option_labels(state), state)

    audio_path = None
    image_path = None
    try:
        question, options, correct = generate_quiz(text)

        quiz_ru = f"Вопрос: {question}. Варианты: 1) {options[0]}, 2) {options[1]}"
        audio_path = synthesize_audio(quiz_ru)

        # Reserve a file name, close the handle, then let PIL write to it.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            image_path = tmp.name
        image.convert("RGB").save(image_path)

        video = make_talking_head(image_path, audio_path)
    except Exception as e:
        traceback.print_exc()
        # The image is only needed by a lesson that actually started.
        _remove_quietly(image_path)
        return (None, f"Ошибка: {e}", *_option_labels(state), state)
    finally:
        # The audio is consumed by the talking-head call and never reused.
        _remove_quietly(audio_path)

    # image_path is kept on purpose: answer_selected reuses it for the reaction video.
    state_new = {
        "correct": correct,
        "options": options,
        "image_path": image_path,
    }
    return video, question, options[0], options[1], state_new
185
 
186
 
187
+ # =========================
188
+ # GRADIO — ШАГ 2
189
+ # =========================
 
 
 
 
 
190
 
191
def answer_selected(selected, state):
    """React to the answer the student picked.

    Args:
        selected: Caption of the clicked answer button.
        state: Lesson state produced by ``start_lesson``; must contain
            ``"correct"`` and ``"image_path"``.

    Returns:
        ``(reaction_video, status_message)``. On any failure the video is
        ``None`` and the message starts with ``"Ошибка: "``, mirroring the
        error handling of ``start_lesson``.
    """
    not_started = (None, "Ошибка: урок не запущен.")
    if not state:
        return not_started

    correct = state.get("correct")
    image_path = state.get("image_path")
    # A partially filled state means no lesson has completed successfully.
    if not correct or not image_path:
        return not_started

    if selected == correct:
        text_ru = "Молодец!"
        message = "Дұрыс!"
    else:
        text_ru = f"Неправильно. Правильный ответ: {correct}"
        message = f"Қате. Дұрыс жауап: {correct}"

    audio = None
    try:
        audio = synthesize_audio(text_ru)
        video = make_talking_head(image_path, audio)
    except Exception as e:
        traceback.print_exc()
        return None, f"Ошибка: {e}"
    finally:
        # Remove the temporary audio even when the talking-head call fails.
        if audio:
            try:
                os.remove(audio)
            except OSError:
                pass

    return video, message
 
 
210
 
211
 
212
  # =========================
213
+ # UI
214
  # =========================
215
 
216
# Page heading and introduction rendered at the top of the app.
title = "🎓 Интерактивный видео-лектор"

description = "<br>".join((
    "Загрузите фото и текст (рус.).",
    "Лектор задаст вопрос и предложит варианты.",
    "После выбора — ответит по-казахски.",
))

# NOTE(review): the nesting of the layout blocks below was reconstructed from
# a flattened diff view — confirm it against the deployed app.py.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}<br>{description}")

    with gr.Row():
        # Inputs: lecturer photo, lecture text and the start button.
        with gr.Column():
            img = gr.Image(type='pil', label="📸 Фото лектора")
            txt = gr.Textbox(lines=5, label="📝 Текст лекции (до 500 символов)")
            btn = gr.Button("Запустить урок")

        # Outputs: quiz video, question, answer buttons, reaction and status.
        with gr.Column():
            out_video = gr.Video(label="🎬 Видео лектора")
            out_question = gr.Markdown(label="Вопрос")
            opt1 = gr.Button("Вариант 1")
            opt2 = gr.Button("Вариант 2")
            react_video = gr.Video(label="🎥 Реакция")
            status = gr.Textbox(label="Статус", interactive=False)

    # Lesson state handed from start_lesson to answer_selected.
    state = gr.State({})

    btn.click(
        fn=start_lesson,
        inputs=[img, txt, state],
        outputs=[out_video, out_question, opt1, opt2, state],
    )

    # Each answer button passes its own value together with the lesson state.
    for option_button in (opt1, opt2):
        option_button.click(
            fn=answer_selected,
            inputs=[option_button, state],
            outputs=[react_video, status],
        )

    demo.load(fn=lambda: "Готово", outputs=status)

if __name__ == "__main__":
    demo.launch()