| | import os |
| | import subprocess |
| | import tempfile |
| | import traceback |
| | import time |
| | import shutil |
| | import torch |
| | import gradio as gr |
| | from textwrap import wrap |
| |
|
| | from nemo.collections.asr.models import EncDecRNNTBPEModel |
| | from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline |
| | from faster_whisper import WhisperModel as FasterWhisperModel |
| |
|
| |
|
# Names offered in the "ASR" dropdown; transcribe() dispatches on these strings.
ASR_BACKENDS = ["Whisper", "FasterWhisper", "NeMoParakeet"]

# Model identifiers handed to each backend's loader.
WHISPER_MODEL = "antony66/whisper-large-v3-russian"
FASTERW_MODEL = "Ash8181/whisper-large-v3-russian-ct2"
PARAKEET_MODEL = "nvidia/parakeet-tdt-0.6b-v3"

# Passed as `chunk_length_s` to the Transformers ASR pipeline.
# Whisper consumes 30-second windows: its feature extractor truncates longer
# input, so a larger chunk length (the previous value was 307) makes the
# pipeline drop most of every chunk. Keep this at 30 for Whisper models.
CHUNK_LENGTH = 30
| |
|
# Built-in subtitle looks, keyed by the label shown in the preset dropdown.
# Each value maps ASS style field names to values; colours are "#RRGGBB" or
# "#AARRGGBB" strings that are converted to ASS notation when the style is
# serialised.
PRESETS = {
    "Viral Shorts (TOP)": dict(
        Alignment=8,
        FontName="Arial Black",
        FontSize=64,
        PrimaryColour="#FFFF00",
        Outline=4,
        OutlineColour="#000000",
        Shadow=1,
        BackColour="#80000000",
        Bold=1,
        BorderStyle=1,
        MarginV=40,
    ),
    "Минимал низ": dict(
        Alignment=2,
        FontName="Montserrat",
        FontSize=28,
        PrimaryColour="#17FC03",
        Outline=1,
        OutlineColour="#000000",
        Shadow=0,
        BackColour="#80000000",
        Bold=1,
        BorderStyle=1,
        MarginV=40,
    ),
}
| |
|
| | _cache = {} |
| |
|
def format_srt_time(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds with rounding *before* being
    split into fields. Truncating the fractional part instead loses a
    millisecond to float representation error (1.001 would print ``,000``)
    and cannot carry into the seconds field.

    Args:
        sec: Offset in seconds (int or float). Negative values are clamped
            to zero, since SRT has no negative timestamps.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(float(sec) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"
| |
|
def color_hex_to_ass(val):
    """Convert a UI colour string to ASS ``&HAABBGGRR`` notation.

    Accepted inputs:
      * ``#RRGGBB``   - alpha is set to ``00``.
      * ``#AARRGGBB`` - the leading byte is copied through as the ASS alpha.
      * ``#RGB``      - shorthand, each digit doubled, alpha ``00``.
      * ``rgb(r, g, b)`` / ``rgba(r, g, b, a)`` - CSS-style strings with
        ``a`` as opacity in 0..1; it is inverted into an alpha byte.

    Anything else (non-strings, strings already in ``&H...`` form, malformed
    values) is returned unchanged so callers can pass through ready-made
    ASS colours.

    Args:
        val: Colour value from a preset or a UI widget.

    Returns:
        The ASS colour string, or ``val`` itself when it is not recognised.
    """
    if not isinstance(val, str):
        return val
    text = val.strip()

    if text.startswith("#"):
        digits = text.lstrip("#")
        if not digits or any(ch not in "0123456789abcdefABCDEF" for ch in digits):
            return val
        if len(digits) == 3:
            digits = "".join(ch * 2 for ch in digits)
        if len(digits) == 6:
            digits = "00" + digits
        if len(digits) != 8:
            # Slicing a wrong-length string would emit a truncated colour.
            return val
        aa, rr, gg, bb = digits[:2], digits[2:4], digits[4:6], digits[6:8]
        return f"&H{aa}{bb}{gg}{rr}"

    lowered = text.lower()
    if lowered.startswith(("rgb(", "rgba(")) and lowered.endswith(")"):
        fields = lowered[lowered.index("(") + 1:-1].split(",")
        if len(fields) not in (3, 4):
            return val
        try:
            numbers = [float(field) for field in fields]
            red, green, blue = (
                min(255, max(0, int(round(n)))) for n in numbers[:3]
            )
            opacity = numbers[3] if len(numbers) == 4 else 1.0
            opacity = min(1.0, max(0.0, opacity))
            # CSS alpha is opacity; the ASS alpha byte is its inverse.
            alpha = int(round((1.0 - opacity) * 255))
        except (ValueError, OverflowError):
            return val
        return f"&H{alpha:02X}{blue:02X}{green:02X}{red:02X}"

    return val
| |
|
def style_to_force(style):
    """Serialise a style mapping as a comma-separated ``Key=Value`` string.

    Values whose key contains ``"Colour"`` are passed through
    ``color_hex_to_ass`` first; all other values are emitted as-is.
    Pairs appear in the mapping's iteration order.
    """
    return ",".join(
        f"{name}={color_hex_to_ass(value) if 'Colour' in name else value}"
        for name, value in style.items()
    )
| |
|
def format_subtitle_text(text, font_size, bold):
    """Normalise the casing of a subtitle line and wrap it for the font size.

    Bold text at size 48 or above is upper-cased entirely. Otherwise only the
    first character is upper-cased and the rest is left alone:
    ``str.capitalize()`` would lower-case the remainder and so destroy proper
    nouns and acronyms produced by the recogniser.

    Args:
        text: Raw segment text; surrounding whitespace is stripped.
        font_size: Subtitle font size; larger fonts get shorter lines.
        bold: Whether the subtitle style is bold.

    Returns:
        The text wrapped with ``"\\n"`` between lines (empty string for
        empty input).
    """
    text = text.strip()
    if bold and font_size >= 48:
        text = text.upper()
    else:
        text = text[:1].upper() + text[1:]

    # (minimum font size, maximum characters per line), largest font first.
    width_by_size = ((60, 12), (48, 16), (36, 24))
    wrap_width = 36
    for min_size, width in width_by_size:
        if font_size >= min_size:
            wrap_width = width
            break

    return "\n".join(wrap(text, wrap_width))
| |
|
| | |
def get_whisper():
    """Return the Transformers Whisper ASR pipeline, building it on first use.

    The pipeline is stored in ``_cache`` under ``"whisper"`` and reused on
    later calls. It runs on CUDA in float16 when a GPU is available and on
    the CPU in float32 otherwise.
    """
    try:
        return _cache["whisper"]
    except KeyError:
        pass

    on_gpu = ("cuda" if torch.cuda.is_available() else "cpu") == "cuda"
    weights_dtype = torch.float16 if on_gpu else torch.float32

    whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(
        WHISPER_MODEL,
        torch_dtype=weights_dtype,
        low_cpu_mem_usage=True,
    )

    asr = pipeline(
        "automatic-speech-recognition",
        model=whisper_model,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        return_timestamps=True,
        chunk_length_s=CHUNK_LENGTH,
        device=0 if on_gpu else -1,
    )
    _cache["whisper"] = asr
    return asr
| |
|
def get_faster_whisper():
    """Return the faster-whisper model, loading it on first use.

    The instance is stored in ``_cache`` under ``"faster"``. It uses CUDA
    with float16 compute when a GPU is available, otherwise the CPU with
    int8 compute.
    """
    try:
        return _cache["faster"]
    except KeyError:
        pass

    if torch.cuda.is_available():
        target, precision = "cuda", "float16"
    else:
        target, precision = "cpu", "int8"

    fw_model = FasterWhisperModel(
        FASTERW_MODEL, device=target, compute_type=precision
    )
    _cache["faster"] = fw_model
    return fw_model
| |
|
def get_parakeet():
    """Return the NeMo Parakeet model, loading it on first use.

    The model is put into eval mode, moved to CUDA when available (CPU
    otherwise) and stored in ``_cache`` under ``"parakeet"``.
    """
    try:
        return _cache["parakeet"]
    except KeyError:
        pass

    asr_model = EncDecRNNTBPEModel.from_pretrained(PARAKEET_MODEL)
    asr_model.eval()
    target = "cuda" if torch.cuda.is_available() else "cpu"
    asr_model = asr_model.to(target)

    _cache["parakeet"] = asr_model
    return asr_model
| |
|
| | |
def transcribe(audio, backend):
    """Transcribe an audio file into timed text segments.

    Args:
        audio: Path to the audio file to recognise.
        backend: ``"Whisper"`` or ``"FasterWhisper"``; any other value
            selects the NeMo Parakeet model.

    Returns:
        A list of ``{"start": float, "end": float, "text": str}`` dicts.
        For Parakeet the word-level timestamps are grouped six words per
        segment.
    """
    if backend == "Whisper":
        result = get_whisper()(audio, generate_kwargs={"language": "russian"})

        # The pipeline can leave a timestamp as None (typically the end of
        # the final chunk when speech runs to the end of the audio). A None
        # would crash SRT formatting downstream, so fill the gap: a missing
        # start continues from the previous segment, a missing end gets a
        # short fixed duration.
        fallback_seconds = 2.0
        segments = []
        previous_end = 0.0
        for chunk in result["chunks"]:
            start, end = chunk["timestamp"]
            if start is None:
                start = previous_end
            if end is None:
                end = start + fallback_seconds
            previous_end = end
            segments.append({"start": start, "end": end, "text": chunk["text"]})
        return segments

    if backend == "FasterWhisper":
        found, _ = get_faster_whisper().transcribe(audio, language="ru")
        return [{"start": s.start, "end": s.end, "text": s.text} for s in found]

    model = get_parakeet()
    words = model.transcribe([audio], timestamps=True)[0].timestamp["word"]

    # Parakeet returns one entry per word; merge them into readable lines.
    words_per_segment = 6
    segments = []
    for offset in range(0, len(words), words_per_segment):
        group = words[offset:offset + words_per_segment]
        segments.append({
            "start": group[0]["start"],
            "end": group[-1]["end"],
            "text": " ".join(w["word"] for w in group),
        })
    return segments
| |
|
| | |
def preview_subtitle_style(video_path, font, size, color, bg, bold, margin):
    """Render a sample subtitle onto the first video frame as a style preview.

    A single frame is extracted with ffmpeg, a one-line ASS script using the
    requested style is written, and the script is burned onto the frame.

    Args:
        video_path: Path to the source video; falsy means "nothing to do".
        font: Font family name.
        size: Font size.
        color: Text colour (any form ``color_hex_to_ass`` accepts).
        bg: Back colour (any form ``color_hex_to_ass`` accepts).
        bold: Whether the text is bold.
        margin: Vertical margin.

    Returns:
        Path to the rendered JPEG, or ``None`` when there is no video or any
        step fails (the error is printed with its traceback).
    """
    if not video_path:
        return None

    tmp = tempfile.mkdtemp()
    frame = os.path.join(tmp, "frame.jpg")
    ass_file = os.path.join(tmp, "preview.ass")
    preview_img = os.path.join(tmp, "preview.jpg")
    rendered = None

    try:
        subprocess.run([
            'ffmpeg', '-y', '-i', video_path, '-vframes', '1', '-q:v', '2', frame
        ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        if not os.path.exists(frame):
            return None

        example_text = "Тут ваши субтитры"
        styled_text = format_subtitle_text(example_text, size, bold)
        # ASS events are single-line; a line break inside the text is the
        # two-character sequence \N, not a real newline.
        event_text = styled_text.replace("\n", "\\N")

        # An ASS "Style:" line is positional: values must follow the order of
        # the "Format:" line. (key=value pairs are only valid for ffmpeg's
        # force_style option, not inside a script.)
        style_fields = [
            "Default",                        # Name
            str(font).replace(",", " "),      # Fontname (a comma would split the field)
            int(size),                        # Fontsize
            color_hex_to_ass(color),          # PrimaryColour
            "&H000000FF",                     # SecondaryColour
            "&H00000000",                     # OutlineColour
            color_hex_to_ass(bg),             # BackColour
            -1 if bold else 0,                # Bold (-1 is "true" in ASS)
            0, 0, 0,                          # Italic, Underline, StrikeOut
            100, 100,                         # ScaleX, ScaleY
            0, 0,                             # Spacing, Angle
            1, 1, 0,                          # BorderStyle, Outline, Shadow
            2,                                # Alignment
            10, 10, int(margin),              # MarginL, MarginR, MarginV
            1,                                # Encoding
        ]
        style_line = ",".join(str(field) for field in style_fields)

        # NOTE(review): the final render burns an SRT file, for which ffmpeg
        # uses its own default script resolution; with PlayRes 1920x1080 here
        # the preview font scale may differ from the final video - confirm.
        ass_lines = [
            "[Script Info]",
            "ScriptType: v4.00+",
            "PlayResX: 1920",
            "PlayResY: 1080",
            "",
            "[V4+ Styles]",
            "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
            f"Style: {style_line}",
            "",
            "[Events]",
            "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
            f"Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,{event_text}",
        ]
        with open(ass_file, "w", encoding="utf-8") as f:
            f.write("\n".join(ass_lines) + "\n")

        # Forward slashes and an escaped drive colon keep the path intact
        # inside the ffmpeg filter argument.
        safe_ass = ass_file.replace("\\", "/").replace(":", "\\:")
        subprocess.run([
            'ffmpeg', '-y', '-i', frame, '-vf', f"ass='{safe_ass}'", preview_img
        ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        if os.path.exists(preview_img):
            rendered = preview_img

    except Exception as e:
        print("Preview error:", e)
        traceback.print_exc()

    finally:
        if rendered is None:
            # Nothing to hand back: drop the whole scratch directory.
            shutil.rmtree(tmp, ignore_errors=True)
        else:
            # Keep only the returned image; remove the intermediates.
            for leftover in (frame, ass_file):
                try:
                    os.remove(leftover)
                except OSError:
                    pass

    return rendered
| |
|
| | |
def _run_ffmpeg(args):
    """Run ffmpeg with the given arguments; raise RuntimeError with its error output on failure."""
    proc = subprocess.run(
        ['ffmpeg', '-y', '-hide_banner', '-loglevel', 'error', *args],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
    )
    if proc.returncode != 0:
        tail = proc.stderr.decode("utf-8", errors="replace").strip().splitlines()[-5:]
        detail = " | ".join(tail) if tail else f"exit code {proc.returncode}"
        raise RuntimeError(f"ffmpeg: {detail}")


def process(video, backend, preset, font, size, color, bg, bold, margin):
    """Transcribe a video and burn the resulting subtitles into a copy of it.

    Steps: extract mono 16 kHz audio, run the selected ASR backend, write an
    SRT file, then render the video with the subtitles filter using the
    preset style overridden by the manual UI settings.

    Args:
        video: Path to the input video; falsy means no video was given.
        backend: ASR backend name understood by ``transcribe``.
        preset: Key into ``PRESETS``.
        font, size, color, bg, bold, margin: Manual style settings that
            override the preset's FontName, FontSize, PrimaryColour,
            BackColour, Bold and MarginV.

    Returns:
        A 5-tuple ``(status, output video path, SRT path, preview text,
        None)``. On failure the paths are ``None`` and the status carries
        the error message; the last element is always ``None``.
    """
    if not video:
        return "❌ Нет видео", None, None, "", None

    tmp = tempfile.mkdtemp()
    wav = os.path.join(tmp, "audio.wav")
    succeeded = False

    try:
        _run_ffmpeg(['-i', video, '-vn', '-ac', '1', '-ar', '16000', wav])

        segs = transcribe(wav, backend)
        if not segs:
            return "❌ Нет речи", None, None, "", None

        style = PRESETS[preset].copy()
        style.update({
            "FontName": font,
            "FontSize": int(size),
            "PrimaryColour": color,
            "BackColour": bg,
            "Bold": int(bold),
            "MarginV": int(margin),
        })

        # Format every segment once; both the on-screen preview and the SRT
        # file use the same text.
        texts = [format_subtitle_text(s["text"], size, bold) for s in segs]

        preview_text = "".join(
            f"{i}. {text.replace(chr(10), ' / ')}\n"
            for i, text in enumerate(texts, 1)
        )

        srt = os.path.join(tmp, "subs.srt")
        with open(srt, "w", encoding="utf-8") as f:
            f.writelines(
                f"{i}\n{format_srt_time(s['start'])} --> {format_srt_time(s['end'])}\n{text}\n\n"
                for i, (s, text) in enumerate(zip(segs, texts), 1)
            )

        out = f"result_{int(time.time())}.mp4"
        fs = style_to_force(style)
        # Forward slashes and an escaped drive colon keep the path intact
        # inside the ffmpeg filter argument.
        safe_srt = srt.replace("\\", "/").replace(":", "\\:")
        vf = f"subtitles='{safe_srt}':force_style='{fs}'"

        _run_ffmpeg(['-i', video, '-vf', vf, '-c:a', 'copy', out])

        succeeded = True
        return "✅ Готово", out, srt, preview_text, None

    except Exception as e:
        # Top-level UI boundary: report to the user, but keep the traceback
        # in the server log so failures remain diagnosable.
        traceback.print_exc()
        return f"❌ Ошибка: {str(e)}", None, None, "", None

    finally:
        if succeeded:
            # The SRT in tmp is returned for download; only the audio goes.
            try:
                os.remove(wav)
            except OSError:
                pass
        else:
            shutil.rmtree(tmp, ignore_errors=True)
| |
|
| | |
# Gradio UI: inputs and style controls in the left column, results in the
# right column. Components are laid out in creation order within each
# context manager, so statement order here defines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 Автосубтитры (Whisper / FasterWhisper / NeMo) + LIVE preview + Превью стиля")

    with gr.Row():
        # Left column: source video, backend/preset choice, manual style.
        with gr.Column():
            video = gr.Video(label="Видео")
            backend = gr.Dropdown(ASR_BACKENDS, value="Whisper", label="ASR")
            preset = gr.Dropdown(list(PRESETS.keys()), value=list(PRESETS.keys())[0], label="Пресет")

            # Manual settings; process() applies them on top of the preset.
            gr.Markdown("### 🎨 Ручная настройка")
            font = gr.Textbox("Montserrat", label="Шрифт")
            size = gr.Slider(minimum=10, maximum=96, value=32, step=1, label="Размер шрифта")
            color = gr.ColorPicker("#FFFFFF", label="Цвет текста")
            bg = gr.ColorPicker("#80000000", label="Фон")
            bold = gr.Checkbox(True, label="Bold")
            # Positional arguments are minimum, maximum, value.
            margin = gr.Slider(10, 100, 40, label="Отступ снизу")

            with gr.Row():
                run_btn = gr.Button("🚀 Сгенерировать субтитры")
                preview_btn = gr.Button("👁️ Превью стиля")

        # Right column: status line, text preview, style preview, outputs.
        with gr.Column():
            status = gr.Markdown()
            preview = gr.Textbox(label="LIVE preview текста субтитров", lines=8)
            preview_img = gr.Image(label="Превью стиля на кадре", type="filepath")
            out_video = gr.Video(label="Видео с субтитрами")
            out_srt = gr.File(label="SRT файл")

    # process() returns five values matching these outputs in order; its last
    # value is always None, which is sent to the style preview image.
    run_btn.click(
        process,
        inputs=[video, backend, preset, font, size, color, bg, bold, margin],
        outputs=[status, out_video, out_srt, preview, preview_img]
    )

    # The style preview needs only the video and the manual style settings.
    preview_btn.click(
        preview_subtitle_style,
        inputs=[video, font, size, color, bg, bold, margin],
        outputs=[preview_img]
    )

# Starts the app with request queuing enabled; runs whenever this module is
# executed or imported.
demo.queue().launch()