# NOTE: the original import order is preserved on purpose (only `re` is new);
# some of these packages load native libraries at import time.
import os
import subprocess
import tempfile
import traceback
import time
import shutil
import re
import torch
import gradio as gr
from textwrap import wrap
from nemo.collections.asr.models import EncDecRNNTBPEModel
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from faster_whisper import WhisperModel as FasterWhisperModel

ASR_BACKENDS = ["Whisper", "FasterWhisper", "NeMoParakeet"]

WHISPER_MODEL = "antony66/whisper-large-v3-russian"
FASTERW_MODEL = "Ash8181/whisper-large-v3-russian-ct2"
PARAKEET_MODEL = "nvidia/parakeet-tdt-0.6b-v3"

# NOTE(review): passed as `chunk_length_s` to the Whisper pipeline. Whisper
# works on 30-second windows, so 307 looks like a typo for 30 -- confirm.
CHUNK_LENGTH = 307

# Subtitle style presets. Keys are ASS style field names; colours are
# "#RRGGBB" or "#AARRGGBB" and are converted by color_hex_to_ass().
PRESETS = {
    "Viral Shorts (TOP)": {
        "Alignment": 8,
        "FontName": "Arial Black",
        "FontSize": 64,
        "PrimaryColour": "#FFFF00",
        "Outline": 4,
        "OutlineColour": "#000000",
        "Shadow": 1,
        "BackColour": "#80000000",
        "Bold": 1,
        "BorderStyle": 1,
        "MarginV": 40,
    },
    "Минимал низ": {
        "Alignment": 2,
        "FontName": "Montserrat",
        "FontSize": 28,
        "PrimaryColour": "#17FC03",
        "Outline": 1,
        "OutlineColour": "#000000",
        "Shadow": 0,
        "BackColour": "#80000000",
        "Bold": 1,
        "BorderStyle": 1,
        "MarginV": 40,
    },
}

# Loaded ASR models, keyed by backend ("whisper", "faster", "parakeet").
_cache = {}

_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")
_CSS_RGB_RE = re.compile(
    r"^rgba?\(\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)\s*(?:,\s*([\d.]+)\s*)?\)$"
)


def format_srt_time(sec):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields, so binary float noise (e.g. 2.3 -> 299.999... ms) cannot drop a
    millisecond and a rounded-up fraction carries into the seconds field.
    Negative inputs are clamped to zero.
    """
    total_ms = max(0, round(float(sec) * 1000))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    seconds, millis = divmod(rest, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"


def color_hex_to_ass(val):
    """Convert a colour string to the ASS ``&HAABBGGRR`` form.

    Accepted inputs:
      * ``#RRGGBB``   -> alpha ``00``
      * ``#AARRGGBB`` -> alpha byte is copied through unchanged
      * ``#RGB``      -> expanded to ``#RRGGBB``
      * ``rgb(r, g, b)`` / ``rgba(r, g, b, a)`` -> CSS opacity ``a`` is
        inverted into an alpha byte (opacity 1 becomes ``00``)

    Anything else (non-strings, values already in ASS form, malformed hex)
    is returned unchanged.
    """
    if not isinstance(val, str):
        return val
    text = val.strip()

    css = _CSS_RGB_RE.match(text)
    if css:
        try:
            red, green, blue = (
                min(255, max(0, round(float(part)))) for part in css.group(1, 2, 3)
            )
            opacity = 1.0 if css.group(4) is None else float(css.group(4))
        except ValueError:
            return val
        alpha = round((1.0 - min(1.0, max(0.0, opacity))) * 255)
        return f"&H{alpha:02X}{blue:02X}{green:02X}{red:02X}"

    if not text.startswith("#"):
        return val

    digits = text.lstrip("#")
    if len(digits) == 3:
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) == 6:
        digits = "00" + digits
    if len(digits) != 8 or any(ch not in _HEX_DIGITS for ch in digits):
        # Not a colour we can interpret; do not emit a garbled value.
        return val
    aa, rr, gg, bb = digits[:2], digits[2:4], digits[4:6], digits[6:8]
    return f"&H{aa}{bb}{gg}{rr}"


def style_to_force(style):
    """Serialize a style dict to a comma-separated ``Key=Value`` string.

    Values whose key contains ``"Colour"`` go through color_hex_to_ass().
    """
    return ",".join(
        f"{key}={color_hex_to_ass(value) if 'Colour' in key else value}"
        for key, value in style.items()
    )


def format_subtitle_text(text, font_size, bold):
    """Normalize and line-wrap one subtitle cue.

    Bold text at font size >= 48 is upper-cased. Otherwise only the first
    character is upper-cased and the rest is left as recognized
    (``str.capitalize`` would lower-case names and acronyms).
    The wrap width shrinks as the font grows.

    Returns the wrapped lines joined with ``"\\n"`` ("" for empty input).
    """
    text = (text or "").strip()
    if not text:
        return ""

    if bold and font_size >= 48:
        text = text.upper()
    else:
        text = text[0].upper() + text[1:]

    # Adaptive wrap width
    if font_size >= 60:
        wrap_width = 12
    elif font_size >= 48:
        wrap_width = 16
    elif font_size >= 36:
        wrap_width = 24
    else:
        wrap_width = 36
    return "\n".join(wrap(text, wrap_width))
# === Model loading ===
# Upper bound for `chunk_length_s`: Whisper consumes 30-second audio windows.
_WHISPER_WINDOW_S = 30
# Duration given to a segment whose end timestamp is missing.
_FALLBACK_SEGMENT_S = 3.0
# Number of Parakeet words merged into one subtitle cue.
_PARAKEET_WORDS_PER_CUE = 6


def get_whisper():
    """Return the cached Hugging Face Whisper ASR pipeline, building it once.

    Uses CUDA with float16 when available, otherwise CPU with float32.
    """
    if "whisper" in _cache:
        return _cache["whisper"]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
    model = WhisperForConditionalGeneration.from_pretrained(
        WHISPER_MODEL, torch_dtype=dtype, low_cpu_mem_usage=True
    )
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True,
        # Clamp to the Whisper window so an oversized CHUNK_LENGTH cannot
        # produce chunks longer than the model input.
        chunk_length_s=min(CHUNK_LENGTH, _WHISPER_WINDOW_S),
        device=0 if device == "cuda" else -1,
    )
    _cache["whisper"] = pipe
    return pipe


def get_faster_whisper():
    """Return the cached faster-whisper model, building it once.

    Uses CUDA/float16 when available, otherwise CPU/int8.
    """
    if "faster" in _cache:
        return _cache["faster"]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute = "float16" if device == "cuda" else "int8"
    model = FasterWhisperModel(FASTERW_MODEL, device=device, compute_type=compute)
    _cache["faster"] = model
    return model


def get_parakeet():
    """Return the cached NeMo Parakeet model (eval mode), building it once."""
    if "parakeet" in _cache:
        return _cache["parakeet"]

    model = EncDecRNNTBPEModel.from_pretrained(PARAKEET_MODEL)
    model.eval()
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")
    _cache["parakeet"] = model
    return model


# === Transcription ===
def _clean_segments(raw_segments):
    """Turn ``(start, end, text)`` triples into well-formed segment dicts.

    Blank-text segments are dropped. A missing start falls back to the
    previous segment's end (or 0.0). A missing or inverted end becomes
    ``start + _FALLBACK_SEGMENT_S``.
    """
    cleaned = []
    for start, end, text in raw_segments:
        text = (text or "").strip()
        if not text:
            continue
        if start is None:
            start = cleaned[-1]["end"] if cleaned else 0.0
        if end is None or end < start:
            end = start + _FALLBACK_SEGMENT_S
        cleaned.append({"start": float(start), "end": float(end), "text": text})
    return cleaned


def transcribe(audio, backend):
    """Transcribe *audio* with the chosen backend.

    Args:
        audio: path to the audio file handed to the backend.
        backend: "Whisper" or "FasterWhisper"; any other value selects
            NeMo Parakeet (unchanged fall-through behaviour).

    Returns:
        A list of ``{"start": float, "end": float, "text": str}`` dicts with
        stripped, non-empty text and numeric timestamps.
    """
    if backend == "Whisper":
        pipe = get_whisper()
        res = pipe(audio, generate_kwargs={"language": "russian"})
        # The end timestamp of a chunk may be None; _clean_segments fixes it.
        return _clean_segments(
            (c["timestamp"][0], c["timestamp"][1], c["text"]) for c in res["chunks"]
        )

    if backend == "FasterWhisper":
        model = get_faster_whisper()
        segs, _ = model.transcribe(audio, language="ru")
        return _clean_segments((s.start, s.end, s.text) for s in segs)

    model = get_parakeet()
    words = model.transcribe([audio], timestamps=True)[0].timestamp["word"]
    # Parakeet yields word-level timestamps; merge fixed-size runs into cues.
    grouped = []
    for i in range(0, len(words), _PARAKEET_WORDS_PER_CUE):
        group = words[i:i + _PARAKEET_WORDS_PER_CUE]
        grouped.append(
            (group[0]["start"], group[-1]["end"], " ".join(w["word"] for w in group))
        )
    return _clean_segments(grouped)
# === Style preview (no transcription!) ===
# Script resolution used for the preview ASS file.
# NOTE(review): ffmpeg's SRT->ASS conversion is understood to use a 384x288
# script resolution, so the preview uses the same one to keep font size and
# margins on the same scale as the burned-in result -- confirm.
_PREVIEW_PLAYRES = (384, 288)


def _escape_filter_path(path):
    """Make *path* usable inside a quoted ffmpeg filter argument."""
    return path.replace("\\", "/").replace(":", "\\:")


def _run_ffmpeg(args):
    """Run ``ffmpeg -y <args>`` quietly.

    Raises:
        RuntimeError: if ffmpeg is missing or exits non-zero; the message
            carries the tail of ffmpeg's stderr.
    """
    cmd = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error", *args]
    try:
        subprocess.run(
            cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
        )
    except FileNotFoundError as err:
        raise RuntimeError("ffmpeg executable not found") from err
    except subprocess.CalledProcessError as err:
        detail = (err.stderr or b"").decode("utf-8", errors="replace").strip()
        raise RuntimeError(f"ffmpeg failed: {detail[-500:] or err}") from err


def preview_subtitle_style(video_path, font, size, color, bg, bold, margin):
    """Render a sample subtitle onto the first frame of the video.

    Returns the path of the rendered JPEG, or None when there is no video or
    any step fails (the error is printed with its traceback). The temporary
    directory is removed on failure; on success it is kept because the
    returned image lives inside it.
    """
    if not video_path:
        return None

    tmp = tempfile.mkdtemp()
    frame = os.path.join(tmp, "frame.jpg")
    ass_file = os.path.join(tmp, "preview.ass")
    rendered = os.path.join(tmp, "preview.jpg")
    result = None
    try:
        # Grab the first frame
        _run_ffmpeg(["-i", video_path, "-vframes", "1", "-q:v", "2", frame])
        if os.path.exists(frame):
            # Fixed sample text for the preview; ASS line breaks are "\N".
            example_text = "Тут ваши субтитры"
            styled_text = format_subtitle_text(example_text, size, bold)
            ass_text = styled_text.replace("\n", "\\N")

            # An ASS "Style:" line is positional and must follow the order
            # of the "Format:" line (Key=Value pairs are only valid for
            # ffmpeg's force_style option).
            style_fields = [
                "Default",                 # Name
                font,                      # Fontname
                int(size),                 # Fontsize
                color_hex_to_ass(color),   # PrimaryColour
                "&H000000FF",              # SecondaryColour
                "&H00000000",              # OutlineColour
                color_hex_to_ass(bg),      # BackColour
                -1 if bold else 0,         # Bold (-1 means true in ASS)
                0, 0, 0,                   # Italic, Underline, StrikeOut
                100, 100,                  # ScaleX, ScaleY
                0, 0,                      # Spacing, Angle
                1,                         # BorderStyle
                1,                         # Outline
                0,                         # Shadow
                2,                         # Alignment (bottom centre)
                10, 10,                    # MarginL, MarginR
                int(margin),               # MarginV
                1,                         # Encoding
            ]
            ass_lines = [
                "[Script Info]",
                "ScriptType: v4.00+",
                f"PlayResX: {_PREVIEW_PLAYRES[0]}",
                f"PlayResY: {_PREVIEW_PLAYRES[1]}",
                "",
                "[V4+ Styles]",
                "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
                "Style: " + ",".join(str(field) for field in style_fields),
                "",
                "[Events]",
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
                f"Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,{ass_text}",
            ]
            with open(ass_file, "w", encoding="utf-8") as f:
                f.write("\n".join(ass_lines) + "\n")

            # Burn the sample subtitle onto the frame
            safe_ass = _escape_filter_path(ass_file)
            _run_ffmpeg(["-i", frame, "-vf", f"ass='{safe_ass}'", rendered])
            if os.path.exists(rendered):
                result = rendered
    except Exception as e:
        print("Preview error:", e)
        traceback.print_exc()

    if result is None:
        shutil.rmtree(tmp, ignore_errors=True)
    return result


# === Full video processing ===
def process(video, backend, preset, font, size, color, bg, bold, margin):
    """Transcribe the video, write an SRT file and burn the subtitles in.

    Returns a 5-tuple for the Gradio outputs:
    ``(status, output_video_path, srt_path, preview_text, None)``.
    On any failure the status holds the error text, the other values are
    empty, the traceback is printed and the temporary directory is removed.
    """
    if not video:
        return "❌ Нет видео", None, None, "", None

    tmp = tempfile.mkdtemp()
    wav = os.path.join(tmp, "audio.wav")
    try:
        # 16 kHz mono WAV for the ASR backends
        _run_ffmpeg(["-i", video, "-vn", "-ac", "1", "-ar", "16000", wav])

        segs = transcribe(wav, backend)
        if not segs:
            shutil.rmtree(tmp, ignore_errors=True)
            return "❌ Нет речи", None, None, "", None

        # Preset first, then the manual controls override it.
        style = PRESETS[preset].copy()
        style.update({
            "FontName": font,
            "FontSize": int(size),
            "PrimaryColour": color,
            "BackColour": bg,
            "Bold": int(bold),
            "MarginV": int(margin),
        })

        # Format every cue once and reuse it for the preview and the SRT.
        cues = [format_subtitle_text(s["text"], size, bold) for s in segs]
        preview_text = "".join(
            f"{i}. {cue.replace(chr(10), ' / ')}\n" for i, cue in enumerate(cues, 1)
        )

        srt = os.path.join(tmp, "subs.srt")
        with open(srt, "w", encoding="utf-8") as f:
            for i, (s, txt) in enumerate(zip(segs, cues), 1):
                f.write(
                    f"{i}\n{format_srt_time(s['start'])} --> "
                    f"{format_srt_time(s['end'])}\n{txt}\n\n"
                )

        out = f"result_{int(time.time())}.mp4"
        fs = style_to_force(style)
        safe_srt = _escape_filter_path(srt)
        vf = f"subtitles='{safe_srt}':force_style='{fs}'"
        _run_ffmpeg(["-i", video, "-vf", vf, "-c:a", "copy", out])

        # The extracted audio is no longer needed; the SRT must stay.
        try:
            os.remove(wav)
        except OSError:
            pass
        return "✅ Готово", out, srt, preview_text, None
    except Exception as e:
        traceback.print_exc()
        shutil.rmtree(tmp, ignore_errors=True)
        return f"❌ Ошибка: {str(e)}", None, None, "", None


# === Gradio interface ===
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 Автосубтитры (Whisper / FasterWhisper / NeMo) + LIVE preview + Превью стиля")

    with gr.Row():
        with gr.Column():
            video = gr.Video(label="Видео")
            backend = gr.Dropdown(ASR_BACKENDS, value="Whisper", label="ASR")
            preset = gr.Dropdown(list(PRESETS.keys()), value=list(PRESETS.keys())[0], label="Пресет")

            gr.Markdown("### 🎨 Ручная настройка")
            font = gr.Textbox("Montserrat", label="Шрифт")
            size = gr.Slider(minimum=10, maximum=96, value=32, step=1, label="Размер шрифта")
            color = gr.ColorPicker("#FFFFFF", label="Цвет текста")
            bg = gr.ColorPicker("#80000000", label="Фон")
            bold = gr.Checkbox(True, label="Bold")
            margin = gr.Slider(10, 100, 40, label="Отступ снизу")

            with gr.Row():
                run_btn = gr.Button("🚀 Сгенерировать субтитры")
                preview_btn = gr.Button("👁️ Превью стиля")

        with gr.Column():
            status = gr.Markdown()
            preview = gr.Textbox(label="LIVE preview текста субтитров", lines=8)
            preview_img = gr.Image(label="Превью стиля на кадре", type="filepath")
            out_video = gr.Video(label="Видео с субтитрами")
            out_srt = gr.File(label="SRT файл")

    run_btn.click(
        process,
        inputs=[video, backend, preset, font, size, color, bg, bold, margin],
        outputs=[status, out_video, out_srt, preview, preview_img],
    )

    preview_btn.click(
        preview_subtitle_style,
        inputs=[video, font, size, color, bg, bold, margin],
        outputs=[preview_img],
    )

demo.queue().launch()