Update app.py
Browse files
app.py
CHANGED
|
@@ -1,67 +1,95 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import torch
|
| 3 |
import gradio as gr
|
| 4 |
-
import whisperx
|
| 5 |
-
from whisperx.diarize import DiarizationPipeline
|
| 6 |
|
| 7 |
# 1. Устройство
|
| 8 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 9 |
|
| 10 |
-
# 2.
|
|
|
|
| 11 |
hf_token = os.getenv("HF_TOKEN", None)
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
"
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
# 4. Инициализируем пайплайн диаризации
|
| 21 |
-
diarize_pipeline = DiarizationPipeline(use_auth_token=hf_token, device=device)
|
| 22 |
-
|
| 23 |
-
def transcribe_with_diarization(audio_path):
|
| 24 |
-
# 5. Транскрипция, жестко на русском
|
| 25 |
-
result = model.transcribe(
|
| 26 |
-
audio_path,
|
| 27 |
-
language="ru"
|
| 28 |
-
)
|
| 29 |
-
# 6. Выравнивание таймингов
|
| 30 |
-
result = whisperx.align(
|
| 31 |
-
result["segments"],
|
| 32 |
-
model,
|
| 33 |
-
audio_path,
|
| 34 |
-
device=device
|
| 35 |
)
|
| 36 |
-
|
|
|
|
| 37 |
diarization = diarize_pipeline(audio_path)
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
)
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
#
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
output = gr.Textbox(label="Результат", lines=20)
|
| 64 |
-
btn.click(transcribe_with_diarization, inputs=audio_input, outputs=output)
|
| 65 |
|
| 66 |
if __name__ == "__main__":
|
| 67 |
-
|
|
|
|
| 1 |
import os
import tempfile

import torch
import gradio as gr
import whisperx
# DiarizationPipeline lives in whisperx.diarize; it is not reliably
# re-exported as whisperx.DiarizationPipeline across versions, so import
# it from its defining module explicitly.
from whisperx.diarize import DiarizationPipeline

# 1. Device selection: prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Load models once at startup so each request only pays inference cost.
asr_model = whisperx.load_model("small", device)

# NOTE(review): pyannote-based diarization requires an HF token with
# access to the gated pyannote models — unset HF_TOKEN will fail at
# pipeline construction or first use.
hf_token = os.getenv("HF_TOKEN", None)
diarize_pipeline = DiarizationPipeline(use_auth_token=hf_token, device=device)
|
| 14 |
|
| 15 |
+
def transcribe_and_prepare(audio_path):
    """Transcribe an audio file (forced Russian), align word timings and
    attach speaker labels.

    Args:
        audio_path: filesystem path to the uploaded audio file.

    Returns:
        A list of ``[index, speaker, start, end, text]`` rows suitable as
        a ``gr.Dataframe`` value (a list of dicts is NOT an accepted
        Dataframe value and broke the table rendering).
    """
    # ASR (forced Russian)
    result = asr_model.transcribe(audio_path, language="ru")

    # Alignment. BUG FIX: whisperx.align() requires a dedicated alignment
    # model plus its metadata from load_align_model(); passing the ASR
    # model (as before) raises at runtime.
    align_model, align_metadata = whisperx.load_align_model(
        language_code="ru", device=device
    )
    aligned = whisperx.align(
        result["segments"], align_model, align_metadata, audio_path, device
    )

    # Diarization + speaker assignment. BUG FIX: whisperx has no
    # `diarize()` helper — assign_word_speakers() merges the diarization
    # output into the aligned transcript.
    diarization = diarize_pipeline(audio_path)
    final = whisperx.assign_word_speakers(diarization, aligned)

    # Rows for the editable UI table.
    ui_data = []
    for i, seg in enumerate(final["segments"]):
        ui_data.append([
            i,
            # Segments with no diarization overlap may lack "speaker".
            seg.get("speaker", "UNKNOWN"),
            f"{seg['start']:.2f}",
            f"{seg['end']:.2f}",
            seg["text"],
        ])
    return ui_data
|
| 39 |
+
|
| 40 |
+
def generate_download(ui_data):
    """Write the (possibly hand-edited) segment table to a TXT file.

    Args:
        ui_data: the table value. BUG FIX: gr.Dataframe passes its value
            as a pandas DataFrame (or a list of rows), not the list of
            dicts the previous code indexed with ``row['speaker']`` —
            that crashed on every click. Dicts, positional rows and
            DataFrames are all accepted now.

    Returns:
        Path to the generated transcript file in the temp directory.
    """
    # Normalize a pandas DataFrame to a plain list of rows.
    if hasattr(ui_data, "values") and not isinstance(ui_data, list):
        ui_data = ui_data.values.tolist()

    def _fields(row):
        # Accept {"speaker": ...} dicts or [index, speaker, start, end, text].
        if isinstance(row, dict):
            return row["speaker"], row["start"], row["end"], row["text"]
        return row[1], row[2], row[3], row[4]

    lines = [
        f"[{spk}] ({start}-{end}): {text}"
        for spk, start, end, text in map(_fields, ui_data)
    ]
    path = os.path.join(tempfile.gettempdir(), "transcript.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return path
|
| 50 |
+
|
| 51 |
+
# 3. Interface
with gr.Blocks(css="""
    .gradio-container { max-width: 900px; margin: auto; }
    @media (max-width: 600px) {
        .gradio-container { padding: 0 10px; }
    }
""") as demo:

    gr.Markdown("## 🎤 Транскрибация и диаризация аудио (русский)")
    audio_in = gr.Audio(label="Загрузите аудио", type="filepath")
    btn = gr.Button("Запустить транскрибацию")

    # Segment table, editable so the user can fix speaker labels/text.
    table = gr.Dataframe(
        headers=["index", "speaker", "start", "end", "text"],
        datatype=["number", "text", "text", "text", "text"],
        interactive=True,
        row_count=(1, None),
        col_count=5,
        wrap=True,
        label="Сегменты (можно править спикера и текст)"
    )

    download_btn = gr.Button("Скачать итоговый TXT")
    download_txt = gr.File(label="Итоговый файл")

    # Wire buttons to the processing functions.
    btn.click(fn=transcribe_and_prepare, inputs=[audio_in], outputs=[table])
    download_btn.click(fn=generate_download, inputs=[table], outputs=[download_txt])

    # Player for a selected segment.
    with gr.Row():
        idx_in = gr.Number(value=0, label="Номер сегмента для прослушивания")
        play_btn = gr.Button("▶️ Прослушать сегмент")
    player = gr.Audio(label="Плеер сегмента")

    def play_segment(audio_path, ui_data, idx):
        """Return audio for the chosen segment index.

        BUG FIX: the previous version returned a
        ``{"filepath", "start_time", "end_time"}`` dict, which is not an
        accepted gr.Audio output value and crashed, and it indexed the
        table value as a list of dicts while gr.Dataframe delivers a
        pandas DataFrame. gr.Audio cannot play a sub-range directly, so
        after validating the index we return the whole file.
        TODO(review): slice the audio (e.g. via ffmpeg) so only the
        segment's start..end actually plays.
        """
        # Normalize a pandas DataFrame to a plain list of rows.
        if hasattr(ui_data, "values") and not isinstance(ui_data, list):
            ui_data = ui_data.values.tolist()
        _ = ui_data[int(idx)]  # raises IndexError for an out-of-range index
        return audio_path

    play_btn.click(fn=play_segment, inputs=[audio_in, table, idx_in], outputs=[player])
|
|
|
|
|
|
|
| 93 |
|
| 94 |
if __name__ == "__main__":
    # Start the Gradio server (blocks until interrupted).
    demo.launch()
|