# Source: Hugging Face Space file app.py, uploaded by AndreySokolov01.
# Commit message: "Update app.py"
# Commit: 23f1f1e (verified)
import os
import subprocess
import tempfile
import traceback
import time
import shutil
import torch
import gradio as gr
from textwrap import wrap
from nemo.collections.asr.models import EncDecRNNTBPEModel
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from faster_whisper import WhisperModel as FasterWhisperModel
# Backend names offered in the UI dropdown and compared against in transcribe().
ASR_BACKENDS = ["Whisper", "FasterWhisper", "NeMoParakeet"]
# Model identifiers passed to each backend's loader (get_whisper,
# get_faster_whisper and get_parakeet respectively).
WHISPER_MODEL = "antony66/whisper-large-v3-russian"
FASTERW_MODEL = "Ash8181/whisper-large-v3-russian-ct2"
PARAKEET_MODEL = "nvidia/parakeet-tdt-0.6b-v3"
# Passed as `chunk_length_s` to the transformers ASR pipeline in get_whisper().
# NOTE(review): Whisper models operate on 30-second windows; confirm that 307
# is intended here and is not a typo for 30.
CHUNK_LENGTH = 307
# Named subtitle style presets. Each preset is copied in process(), overlaid
# with the manual UI settings and serialised by style_to_force(). Colour values
# are "#RRGGBB" or "#AARRGGBB" strings, which color_hex_to_ass() converts.
PRESETS = {
"Viral Shorts (TOP)": {
"Alignment": 8,
"FontName": "Arial Black",
"FontSize": 64,
"PrimaryColour": "#FFFF00",
"Outline": 4,
"OutlineColour": "#000000",
"Shadow": 1,
"BackColour": "#80000000",
"Bold": 1,
"BorderStyle": 1,
"MarginV": 40,
},
"Минимал низ": {
"Alignment": 2,
"FontName": "Montserrat",
"FontSize": 28,
"PrimaryColour": "#17FC03",
"Outline": 1,
"OutlineColour": "#000000",
"Shadow": 0,
"BackColour": "#80000000",
"Bold": 1,
"BorderStyle": 1,
"MarginV": 40,
},
}
# Lazily populated store of loaded models, keyed by "whisper", "faster" and
# "parakeet" (see the get_* loader functions).
_cache = {}
def format_srt_time(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to a whole number of milliseconds first (rounded
    to nearest) and then split with integer arithmetic. Deriving the
    millisecond field from the float's fractional part, as the previous
    implementation did, loses a millisecond to binary rounding error
    (e.g. 1.001 came out as ``00:00:01,000``).

    Args:
        sec: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.

    Raises:
        TypeError: If ``sec`` is not a number (for example ``None``).
    """
    total_ms = max(0, int(round(float(sec) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"
def color_hex_to_ass(val):
    """Convert a colour string to the ASS ``&HAABBGGRR`` notation.

    Accepted inputs:

    * ``#RRGGBB`` - alpha defaults to ``00``.
    * ``#AARRGGBB`` - the leading pair is used as the ASS alpha byte as-is.
    * ``#RGB`` - shorthand, each digit is doubled.
    * ``rgb(r, g, b)`` / ``rgba(r, g, b, a)`` with numeric components, where
      ``a`` is an opacity in the range 0..1 (1 becomes ASS alpha ``00``).

    Anything else - non-strings, strings already in ASS form such as
    ``&H00000000``, or malformed colours - is returned unchanged, so callers
    may pass values that are already converted.

    Args:
        val: The colour value to convert.

    Returns:
        The ``&H...`` string, or ``val`` itself when it is not recognised.
    """
    if not isinstance(val, str):
        return val

    text = val.strip()
    lowered = text.lower()

    # Functional notation: rgb(...) / rgba(...).
    if lowered.startswith(("rgb(", "rgba(")) and lowered.endswith(")"):
        fields = lowered[lowered.index("(") + 1:-1].split(",")
        if len(fields) not in (3, 4):
            return val
        try:
            numbers = [float(field) for field in fields]
            rr, gg, bb = (
                min(255, max(0, int(round(channel)))) for channel in numbers[:3]
            )
            opacity = numbers[3] if len(numbers) == 4 else 1.0
            opacity = min(1.0, max(0.0, opacity))
            # The 0..1 value is an opacity; the ASS alpha byte is the inverse.
            aa = int(round((1.0 - opacity) * 255))
        except (ValueError, OverflowError):
            # Percentages, NaN/inf or other unparsable components.
            return val
        return f"&H{aa:02X}{bb:02X}{gg:02X}{rr:02X}"

    if not text.startswith("#"):
        return val

    digits = text.lstrip("#")
    if len(digits) == 3:
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) == 6:
        digits = "00" + digits
    # Reject anything that would otherwise be sliced into a bogus colour.
    if len(digits) != 8 or any(ch not in "0123456789abcdefABCDEF" for ch in digits):
        return val

    aa, rr, gg, bb = digits[:2], digits[2:4], digits[4:6], digits[6:8]
    return f"&H{aa}{bb}{gg}{rr}"
def style_to_force(style):
    """Serialise a style mapping as a comma-separated ``Key=Value`` string.

    Values whose key contains ``"Colour"`` are passed through
    ``color_hex_to_ass`` first; all other values are written as they are.
    Entries appear in the mapping's iteration order.
    """
    def render(key, value):
        shown = color_hex_to_ass(value) if "Colour" in key else value
        return f"{key}={shown}"

    return ",".join(render(key, value) for key, value in style.items())
def format_subtitle_text(text, font_size, bold):
    """Normalise the case of a subtitle line and wrap it for display.

    Large bold text (``bold`` truthy and ``font_size >= 48``) is upper-cased.
    Otherwise only the first character is upper-cased and the rest of the
    text keeps its original case. ``str.capitalize()`` is deliberately not
    used: it lower-cases everything after the first character, which mangles
    proper nouns and acronyms in the transcript.

    Args:
        text: Raw subtitle text; surrounding whitespace is stripped.
        font_size: Font size, used to choose the wrap width.
        bold: Whether the subtitle is rendered bold.

    Returns:
        The formatted text with lines joined by ``"\\n"`` (an empty string
        for blank input).
    """
    text = text.strip()
    if bold and font_size >= 48:
        text = text.upper()
    else:
        text = text[:1].upper() + text[1:]

    # Adaptive wrap width: the larger the font, the fewer characters per line.
    if font_size >= 60:
        wrap_width = 12
    elif font_size >= 48:
        wrap_width = 16
    elif font_size >= 36:
        wrap_width = 24
    else:
        wrap_width = 36
    return "\n".join(wrap(text, wrap_width))
# === Model loading ===
def get_whisper():
    """Return the transformers Whisper ASR pipeline, building it on first use.

    The pipeline is stored in ``_cache["whisper"]`` and reused afterwards.
    On CUDA the model is loaded in float16 on device 0; otherwise it is
    loaded in float32 on the CPU.
    """
    try:
        return _cache["whisper"]
    except KeyError:
        pass

    use_cuda = torch.cuda.is_available()
    weights_dtype = torch.float16 if use_cuda else torch.float32

    processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
    model = WhisperForConditionalGeneration.from_pretrained(
        WHISPER_MODEL,
        torch_dtype=weights_dtype,
        low_cpu_mem_usage=True,
    )
    asr = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True,
        chunk_length_s=CHUNK_LENGTH,
        device=0 if use_cuda else -1,
    )
    _cache["whisper"] = asr
    return asr
def get_faster_whisper():
    """Return the faster-whisper model, loading it on first use.

    The model is stored in ``_cache["faster"]``. It runs with float16
    computation on CUDA and int8 computation on the CPU.
    """
    try:
        return _cache["faster"]
    except KeyError:
        pass

    if torch.cuda.is_available():
        target, precision = "cuda", "float16"
    else:
        target, precision = "cpu", "int8"

    _cache["faster"] = FasterWhisperModel(
        FASTERW_MODEL, device=target, compute_type=precision
    )
    return _cache["faster"]
def get_parakeet():
    """Return the NeMo Parakeet model, loading it on first use.

    The model is put in eval mode, moved to CUDA when available (CPU
    otherwise) and stored in ``_cache["parakeet"]``.
    """
    try:
        return _cache["parakeet"]
    except KeyError:
        pass

    target = "cuda" if torch.cuda.is_available() else "cpu"
    loaded = EncDecRNNTBPEModel.from_pretrained(PARAKEET_MODEL)
    loaded.eval()
    _cache["parakeet"] = loaded.to(target)
    return _cache["parakeet"]
# === Transcription ===
def _fill_missing_timestamps(segments, fallback_duration=2.0):
    """Replace ``None`` start/end times in ``segments`` (modified in place).

    A missing start becomes the previous segment's end (0.0 for the first
    segment). A missing end becomes the next segment's start when that is
    known and later than this segment's start, otherwise
    ``start + fallback_duration`` seconds.
    """
    previous_end = 0.0
    for index, segment in enumerate(segments):
        if segment["start"] is None:
            segment["start"] = previous_end
        if segment["end"] is None:
            following = segments[index + 1]["start"] if index + 1 < len(segments) else None
            if following is not None and following > segment["start"]:
                segment["end"] = following
            else:
                segment["end"] = segment["start"] + fallback_duration
        previous_end = segment["end"]
    return segments


def transcribe(audio, backend):
    """Transcribe an audio file with the selected ASR backend.

    Args:
        audio: Path to the audio file to transcribe.
        backend: ``"Whisper"`` or ``"FasterWhisper"``; any other value uses
            the NeMo Parakeet model.

    Returns:
        A list of ``{"start": ..., "end": ..., "text": ...}`` dicts in
        transcript order, with times in seconds.
    """
    if backend == "Whisper":
        result = get_whisper()(audio, generate_kwargs={"language": "russian"})
        segments = [
            {
                "start": chunk["timestamp"][0],
                "end": chunk["timestamp"][1],
                "text": chunk["text"],
            }
            for chunk in result["chunks"]
        ]
        # The pipeline can leave a timestamp as None (typically the end of
        # the final chunk); downstream time formatting needs real numbers.
        return _fill_missing_timestamps(segments)

    if backend == "FasterWhisper":
        found, _ = get_faster_whisper().transcribe(audio, language="ru")
        return [{"start": s.start, "end": s.end, "text": s.text} for s in found]

    # Parakeet returns word-level timestamps; group them into fixed-size
    # runs of words so each subtitle holds a short phrase.
    words = get_parakeet().transcribe([audio], timestamps=True)[0].timestamp["word"]
    step = 6
    groups = (words[i:i + step] for i in range(0, len(words), step))
    return [
        {
            "start": group[0]["start"],
            "end": group[-1]["end"],
            "text": " ".join(w["word"] for w in group),
        }
        for group in groups
    ]
# === Style preview (no transcription!) ===
def preview_subtitle_style(video_path, font, size, color, bg, bold, margin):
    """Render a sample subtitle onto the first frame of a video.

    The first frame is extracted with ffmpeg, a one-line ASS script using
    the requested style is generated, and ffmpeg's ``ass`` filter draws it
    onto the frame.

    Fixes over the previous version:

    * The ``Style:`` line is written as positional fields in the order of
      the ``Format:`` line. It used to contain ``Key=Value`` pairs, which is
      ``force_style`` syntax and not valid in a style definition.
    * Line breaks in the sample text are written as the ASS ``\\N`` escape
      rather than a raw newline that would split the ``Dialogue`` line.
    * The scratch directory is removed afterwards; only the resulting image
      (created outside it) is left behind.

    Args:
        video_path: Path to the source video; a falsy value returns ``None``.
        font: Font family name.
        size: Font size.
        color: Text colour (converted with ``color_hex_to_ass``).
        bg: Back colour (converted with ``color_hex_to_ass``).
        bold: Whether the text is bold.
        margin: Vertical margin.

    Returns:
        Path to the preview JPEG, or ``None`` if no video was given or any
        step failed (the error is printed with a traceback).
    """
    if not video_path:
        return None

    tmp = tempfile.mkdtemp()
    frame = os.path.join(tmp, "frame.jpg")
    ass_file = os.path.join(tmp, "preview.ass")
    preview_img = None
    result = None
    try:
        # Extract the first frame.
        subprocess.run(
            ['ffmpeg', '-y', '-i', video_path, '-vframes', '1', '-q:v', '2', frame],
            check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        if not os.path.exists(frame):
            return None

        # Fixed sample text; ASS uses \N for a hard line break.
        example_text = "Тут ваши субтитры"
        styled_text = format_subtitle_text(example_text, size, bold).replace("\n", "\\N")

        # Style values in exactly the order of the Format line below.
        style_fields = [
            "Default",                              # Name
            str(font).replace(",", " ").strip(),    # Fontname (commas separate fields)
            int(size),                              # Fontsize
            color_hex_to_ass(color),                # PrimaryColour
            "&H000000FF",                           # SecondaryColour
            "&H00000000",                           # OutlineColour
            color_hex_to_ass(bg),                   # BackColour
            -1 if bold else 0,                      # Bold (-1 is the ASS "true")
            0, 0, 0,                                # Italic, Underline, StrikeOut
            100, 100,                               # ScaleX, ScaleY
            0, 0,                                   # Spacing, Angle
            1,                                      # BorderStyle
            1,                                      # Outline
            0,                                      # Shadow
            2,                                      # Alignment
            10, 10,                                 # MarginL, MarginR
            int(margin),                            # MarginV
            1,                                      # Encoding
        ]
        style_line = "Style: " + ",".join(str(field) for field in style_fields)

        # NOTE(review): process() burns in an SRT file, for which the script
        # resolution is chosen by ffmpeg/libass rather than this 1920x1080
        # canvas, so sizes here may not match the final render - confirm.
        with open(ass_file, "w", encoding="utf-8") as f:
            f.write("[Script Info]\n")
            f.write("ScriptType: v4.00+\n")
            f.write("PlayResX: 1920\n")
            f.write("PlayResY: 1080\n\n")
            f.write("[V4+ Styles]\n")
            f.write("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n")
            f.write(style_line + "\n\n")
            f.write("[Events]\n")
            f.write("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n")
            f.write(f"Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,{styled_text}\n")

        # The result is created outside the scratch directory so that the
        # directory can be deleted while the image stays available.
        fd, preview_img = tempfile.mkstemp(prefix="preview_", suffix=".jpg")
        os.close(fd)

        # Draw the subtitle onto the frame.
        safe_ass = ass_file.replace("\\", "/").replace(":", "\\:")
        subprocess.run(
            ['ffmpeg', '-y', '-i', frame, '-vf', f"ass='{safe_ass}'", preview_img],
            check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        if os.path.getsize(preview_img) > 0:
            result = preview_img
    except Exception as e:
        # UI boundary: report the problem and fall through to return None.
        print("Preview error:", e)
        traceback.print_exc()
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
        if preview_img is not None and result is None:
            # Do not leave an empty or partial image behind on failure.
            try:
                os.remove(preview_img)
            except OSError:
                pass
    return result
# === Full video processing ===
def process(video, backend, preset, font, size, color, bg, bold, margin):
    """Transcribe a video and burn the resulting subtitles into it.

    Steps: extract mono 16 kHz audio with ffmpeg, transcribe it with the
    chosen backend, write an SRT file, then re-encode the video with the
    ``subtitles`` filter using the preset overlaid with the manual settings.

    Changes over the previous version:

    * The output video is written into the per-request temp directory
      instead of the current working directory, so two requests within the
      same second cannot overwrite each other's file.
    * The intermediate WAV is always deleted, and the whole temp directory
      is removed when nothing from it is returned.
    * Each segment is formatted once; segments that are blank after
      formatting are skipped so the SRT contains no empty cues.
    * Failures are also logged with a traceback.

    Args:
        video: Path to the input video; a falsy value yields an error status.
        backend: ASR backend name passed to ``transcribe``.
        preset: Key into ``PRESETS``.
        font, size, color, bg, bold, margin: Manual style settings that
            override the corresponding preset fields.

    Returns:
        A 5-tuple ``(status, video_path, srt_path, preview_text, None)``.
        On failure the paths are ``None`` and the preview text is empty.
    """
    if not video:
        return "❌ Нет видео", None, None, "", None

    tmp = tempfile.mkdtemp()
    wav = os.path.join(tmp, "audio.wav")
    succeeded = False
    try:
        subprocess.run(
            ['ffmpeg', '-y', '-i', video, '-vn', '-ac', '1', '-ar', '16000', wav],
            check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        segs = transcribe(wav, backend)
        if not segs:
            return "❌ Нет речи", None, None, "", None

        style = PRESETS[preset].copy()
        style.update({
            "FontName": font,
            "FontSize": int(size),
            "PrimaryColour": color,
            "BackColour": bg,
            "Bold": int(bold),
            "MarginV": int(margin),
        })

        # Format every segment once and drop those with no visible text.
        cues = []
        for seg in segs:
            formatted = format_subtitle_text(seg["text"], size, bold)
            if formatted:
                cues.append((seg, formatted))
        if not cues:
            return "❌ Нет речи", None, None, "", None

        preview_text = "".join(
            f"{i}. {formatted.replace(chr(10), ' / ')}\n"
            for i, (_, formatted) in enumerate(cues, 1)
        )

        srt = os.path.join(tmp, "subs.srt")
        with open(srt, "w", encoding="utf-8") as f:
            f.writelines(
                f"{i}\n{format_srt_time(seg['start'])} --> {format_srt_time(seg['end'])}\n{formatted}\n\n"
                for i, (seg, formatted) in enumerate(cues, 1)
            )

        out = os.path.join(tmp, f"result_{int(time.time())}.mp4")
        fs = style_to_force(style)
        safe_srt = srt.replace("\\", "/").replace(":", "\\:")
        vf = f"subtitles='{safe_srt}':force_style='{fs}'"
        subprocess.run(
            ['ffmpeg', '-y', '-i', video, '-vf', vf, '-c:a', 'copy', out],
            check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        succeeded = True
        return "✅ Готово", out, srt, preview_text, None
    except Exception as e:
        # UI boundary: log the details and show a short status to the user.
        traceback.print_exc()
        return f"❌ Ошибка: {str(e)}", None, None, "", None
    finally:
        if succeeded:
            # The video and SRT in tmp are returned; only the audio is scrap.
            try:
                os.remove(wav)
            except OSError:
                pass
        else:
            shutil.rmtree(tmp, ignore_errors=True)
# === Gradio interface ===
# Build the Gradio UI: input controls, two action buttons and the output
# widgets, then wire the buttons to process() and preview_subtitle_style().
# NOTE(review): indentation was lost in this copy of the file, so the exact
# nesting of the rows/columns below cannot be confirmed from here.
with gr.Blocks() as demo:
gr.Markdown("## 🎬 Автосубтитры (Whisper / FasterWhisper / NeMo) + LIVE preview + Превью стиля")
with gr.Row():
with gr.Column():
# Inputs: source video, ASR backend and style preset.
video = gr.Video(label="Видео")
backend = gr.Dropdown(ASR_BACKENDS, value="Whisper", label="ASR")
preset = gr.Dropdown(list(PRESETS.keys()), value=list(PRESETS.keys())[0], label="Пресет")
gr.Markdown("### 🎨 Ручная настройка")
# Manual style controls; process() applies these on top of the preset.
font = gr.Textbox("Montserrat", label="Шрифт")
size = gr.Slider(minimum=10, maximum=96, value=32, step=1, label="Размер шрифта")
color = gr.ColorPicker("#FFFFFF", label="Цвет текста")
# NOTE(review): the default is an 8-digit value, which color_hex_to_ass()
# reads as AARRGGBB - confirm that ColorPicker accepts and returns it.
bg = gr.ColorPicker("#80000000", label="Фон")
bold = gr.Checkbox(True, label="Bold")
margin = gr.Slider(10, 100, 40, label="Отступ снизу")
with gr.Row():
run_btn = gr.Button("🚀 Сгенерировать субтитры")
preview_btn = gr.Button("👁️ Превью стиля")
with gr.Column():
# Outputs: status line, text preview, style preview image, video and SRT.
status = gr.Markdown()
preview = gr.Textbox(label="LIVE preview текста субтитров", lines=8)
preview_img = gr.Image(label="Превью стиля на кадре", type="filepath")
out_video = gr.Video(label="Видео с субтитрами")
out_srt = gr.File(label="SRT файл")
# Full run: the five outputs match the 5-tuple returned by process().
run_btn.click(
process,
inputs=[video, backend, preset, font, size, color, bg, bold, margin],
outputs=[status, out_video, out_srt, preview, preview_img]
)
# Style-only preview: no backend or preset is passed, only the manual settings.
preview_btn.click(
preview_subtitle_style,
inputs=[video, font, size, color, bg, bold, margin],
outputs=[preview_img]
)
# Enable the request queue and start the app; this runs at module level
# (there is no `if __name__ == "__main__"` guard).
demo.queue().launch()