Rus_Video_captionning

Sleeping

App Files Files Community

Rus_Video_captionning / app.py

AndreySokolov01

Update app.py

23f1f1e verified about 1 month ago

raw

history blame contribute delete

11.2 kB

	import os
	import subprocess
	import tempfile
	import traceback
	import time
	import shutil
	import torch
	import gradio as gr
	from textwrap import wrap

	from nemo.collections.asr.models import EncDecRNNTBPEModel
	from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
	from faster_whisper import WhisperModel as FasterWhisperModel


	# Names of the speech-recognition backends offered in the "ASR" dropdown.
	ASR_BACKENDS = ["Whisper", "FasterWhisper", "NeMoParakeet"]

	# Model identifiers handed to the loaders get_whisper(), get_faster_whisper()
	# and get_parakeet() respectively.
	WHISPER_MODEL = "antony66/whisper-large-v3-russian"
	FASTERW_MODEL = "Ash8181/whisper-large-v3-russian-ct2"
	PARAKEET_MODEL = "nvidia/parakeet-tdt-0.6b-v3"

	# Passed as ``chunk_length_s`` to the transformers ASR pipeline in get_whisper().
	# NOTE(review): 307 is an unusual value -- Whisper checkpoints are normally
	# chunked at 30 s; confirm this is not a typo for 30.
	CHUNK_LENGTH = 307

	# Built-in subtitle style presets, keyed by the name shown in the preset
	# dropdown. Each value maps style field names to values; process() copies the
	# selected preset, overrides several fields with the manual settings and
	# serialises the result with style_to_force(). Colour values are "#"-prefixed
	# hex strings that color_hex_to_ass() converts (8 digits = alpha first).
	PRESETS = {
	"Viral Shorts (TOP)": {
	"Alignment": 8,
	"FontName": "Arial Black",
	"FontSize": 64,
	"PrimaryColour": "#FFFF00",
	"Outline": 4,
	"OutlineColour": "#000000",
	"Shadow": 1,
	"BackColour": "#80000000",
	"Bold": 1,
	"BorderStyle": 1,
	"MarginV": 40,
	},
	"Минимал низ": {
	"Alignment": 2,
	"FontName": "Montserrat",
	"FontSize": 28,
	"PrimaryColour": "#17FC03",
	"Outline": 1,
	"OutlineColour": "#000000",
	"Shadow": 0,
	"BackColour": "#80000000",
	"Bold": 1,
	"BorderStyle": 1,
	"MarginV": 40,
	},
	}

	# Loaded ASR models/pipelines keyed by "whisper", "faster" and "parakeet";
	# filled by the get_* loaders so each model is built only once per process.
	_cache = {}

	def format_srt_time(sec):
	    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

	    The value is rounded to the nearest millisecond *before* it is split into
	    fields. Truncating the fractional part instead loses a millisecond to
	    binary floating-point error (``1.001`` would render as ``00:00:01,000``).
	    Negative inputs are clamped to zero because SRT has no negative times.

	    Args:
	        sec: Offset from the start of the media, in seconds (int or float).

	    Returns:
	        The timestamp as a string with zero-padded fields.
	    """
	    total_ms = max(0, int(round(sec * 1000)))
	    total_s, ms = divmod(total_ms, 1000)
	    total_m, s = divmod(total_s, 60)
	    h, m = divmod(total_m, 60)
	    return f"{h:02}:{m:02}:{s:02},{ms:03}"

	def color_hex_to_ass(val):
	    """Convert a ``#``-prefixed hex colour into the ASS ``&HAABBGGRR`` form.

	    Accepted inputs are ``#RGB``, ``#ARGB``, ``#RRGGBB`` and ``#AARRGGBB``
	    (alpha first, matching the preset values in this file). Short forms are
	    expanded by doubling every digit; a colour without alpha gets ``00``.

	    Args:
	        val: The colour to convert. Anything that is not a string starting
	            with ``#`` is assumed to be in its final form already.

	    Returns:
	        The ``&HAABBGGRR`` string, or ``val`` unchanged when it is not a
	        ``#`` colour or cannot be parsed (wrong length, non-hex digits).
	        Returning it untouched avoids emitting a scrambled colour.
	    """
	    if not (isinstance(val, str) and val.startswith("#")):
	        return val

	    digits = val.lstrip("#")
	    if len(digits) in (3, 4):
	        # CSS-style shorthand: "F00" -> "FF0000".
	        digits = "".join(ch * 2 for ch in digits)
	    if len(digits) == 6:
	        digits = "00" + digits

	    hex_chars = "0123456789abcdefABCDEF"
	    if len(digits) != 8 or any(ch not in hex_chars for ch in digits):
	        return val

	    aa, rr, gg, bb = digits[:2], digits[2:4], digits[4:6], digits[6:8]
	    # ASS stores the colour channels in reverse order: blue, green, red.
	    return f"&H{aa}{bb}{gg}{rr}"

	def style_to_force(style):
	    """Serialise a style mapping into a ``Key=Value,Key=Value`` string.

	    Values whose key contains ``"Colour"`` are first passed through
	    color_hex_to_ass(); all other values are written as they are. Pairs
	    keep the mapping's iteration order.

	    Args:
	        style: Mapping of style field names to values.

	    Returns:
	        The comma-joined ``Key=Value`` pairs (empty string for an empty
	        mapping).
	    """
	    return ",".join(
	        f"{name}={color_hex_to_ass(value) if 'Colour' in name else value}"
	        for name, value in style.items()
	    )

	def format_subtitle_text(text, font_size, bold):
	    """Normalise the case of a subtitle line and wrap it to the font size.

	    Bold text at font size 48 or larger is upper-cased. Otherwise only the
	    first character is upper-cased and the rest is left as transcribed.
	    ``str.capitalize()`` is deliberately not used: it lower-cases every
	    other character and would destroy proper nouns and acronyms.

	    Args:
	        text: Raw subtitle text; surrounding whitespace is stripped.
	        font_size: Font size used to pick the wrap width.
	        bold: Truthy when the subtitle is rendered bold.

	    Returns:
	        The text with lines joined by ``"\\n"`` (empty string for blank
	        input).
	    """
	    text = text.strip()
	    if bold and font_size >= 48:
	        text = text.upper()
	    else:
	        text = text[:1].upper() + text[1:]

	    # Adaptive wrap width: the larger the font, the fewer characters per line.
	    if font_size >= 60:
	        wrap_width = 12
	    elif font_size >= 48:
	        wrap_width = 16
	    elif font_size >= 36:
	        wrap_width = 24
	    else:
	        wrap_width = 36

	    return "\n".join(wrap(text, wrap_width))

	# === Model loading ===
	def get_whisper():
	    """Return the transformers Whisper ASR pipeline, building it on first use.

	    The pipeline is stored in ``_cache["whisper"]`` and reused afterwards.
	    It runs on CUDA in float16 when a GPU is available, otherwise on CPU in
	    float32, and is configured to return timestamps.

	    Returns:
	        The ``automatic-speech-recognition`` pipeline object.
	    """
	    if "whisper" not in _cache:
	        use_cuda = torch.cuda.is_available()
	        processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
	        model = WhisperForConditionalGeneration.from_pretrained(
	            WHISPER_MODEL,
	            torch_dtype=torch.float16 if use_cuda else torch.float32,
	            low_cpu_mem_usage=True,
	        )
	        _cache["whisper"] = pipeline(
	            "automatic-speech-recognition",
	            model=model,
	            tokenizer=processor.tokenizer,
	            feature_extractor=processor.feature_extractor,
	            return_timestamps=True,
	            chunk_length_s=CHUNK_LENGTH,
	            device=0 if use_cuda else -1,
	        )
	    return _cache["whisper"]

	def get_faster_whisper():
	    """Return the faster-whisper model, loading it on first use.

	    The model is stored in ``_cache["faster"]``. It is created for CUDA with
	    ``float16`` compute when a GPU is available, otherwise for CPU with
	    ``int8`` compute.

	    Returns:
	        The loaded ``FasterWhisperModel`` instance.
	    """
	    if "faster" not in _cache:
	        on_gpu = torch.cuda.is_available()
	        _cache["faster"] = FasterWhisperModel(
	            FASTERW_MODEL,
	            device="cuda" if on_gpu else "cpu",
	            compute_type="float16" if on_gpu else "int8",
	        )
	    return _cache["faster"]

	def get_parakeet():
	    """Return the NeMo Parakeet model, loading it on first use.

	    The model is put into eval mode, moved to CUDA when available (CPU
	    otherwise) and stored in ``_cache["parakeet"]``.

	    Returns:
	        The loaded ``EncDecRNNTBPEModel`` instance.
	    """
	    if "parakeet" not in _cache:
	        net = EncDecRNNTBPEModel.from_pretrained(PARAKEET_MODEL)
	        net.eval()
	        _cache["parakeet"] = net.to(
	            "cuda" if torch.cuda.is_available() else "cpu"
	        )
	    return _cache["parakeet"]

	# === Transcription ===
	def _repair_timestamps(chunks, fallback_duration=2.0):
	    """Replace ``None`` start/end times in ``chunks`` with usable numbers.

	    A missing start falls back to the previous segment's end (or 0.0). A
	    missing end falls back to the next segment's start when that lies after
	    this start, otherwise to ``start + fallback_duration`` seconds.
	    """
	    repaired = []
	    for idx, chunk in enumerate(chunks):
	        start, end = chunk["start"], chunk["end"]
	        if start is None:
	            start = repaired[-1]["end"] if repaired else 0.0
	        if end is None:
	            following = chunks[idx + 1]["start"] if idx + 1 < len(chunks) else None
	            if following is not None and following > start:
	                end = following
	            else:
	                end = start + fallback_duration
	        repaired.append({"start": start, "end": end, "text": chunk["text"]})
	    return repaired


	def transcribe(audio, backend):
	    """Transcribe an audio file into timed text segments.

	    Args:
	        audio: Path of the audio file handed to the backend.
	        backend: ``"Whisper"`` or ``"FasterWhisper"``; any other value
	            selects the NeMo Parakeet model.

	    Returns:
	        A list of dicts with ``"start"`` and ``"end"`` (seconds) and
	        ``"text"``. Parakeet word timestamps are grouped six words per
	        segment.
	    """
	    if backend == "Whisper":
	        pipe = get_whisper()
	        res = pipe(audio, generate_kwargs={"language": "russian"})
	        chunks = [
	            {"start": c["timestamp"][0], "end": c["timestamp"][1], "text": c["text"]}
	            for c in res["chunks"]
	        ]
	        # The pipeline can report ``None`` as a chunk's end time (typically
	        # the last chunk, when the audio stops mid-phrase); downstream time
	        # formatting needs a number, so fill such gaps here.
	        return _repair_timestamps(chunks)

	    if backend == "FasterWhisper":
	        model = get_faster_whisper()
	        segs, _ = model.transcribe(audio, language="ru")
	        return [{"start": s.start, "end": s.end, "text": s.text} for s in segs]

	    model = get_parakeet()
	    words = model.transcribe([audio], timestamps=True)[0].timestamp["word"]
	    step = 6  # words per subtitle segment
	    groups = (words[i:i + step] for i in range(0, len(words), step))
	    return [
	        {
	            "start": g[0]["start"],
	            "end": g[-1]["end"],
	            "text": " ".join(w["word"] for w in g),
	        }
	        for g in groups
	    ]

	# === Style preview (no transcription!) ===
	def preview_subtitle_style(video_path, font, size, color, bg, bold, margin):
	    """Render the first video frame with a sample subtitle in the chosen style.

	    No transcription is run. The first frame is extracted with ffmpeg, a
	    one-line ASS script holding a fixed sample text is written, and ffmpeg's
	    ``ass`` filter burns it onto the frame.

	    Args:
	        video_path: Path of the source video; a falsy value returns ``None``.
	        font: Font name.
	        size: Font size.
	        color: Text colour (``#`` hex string, see color_hex_to_ass()).
	        bg: Back colour (``#`` hex string).
	        bold: Truthy for bold text.
	        margin: Vertical margin.

	    Returns:
	        Path of the rendered JPEG, or ``None`` when there is no video or any
	        step fails (the error is printed together with a traceback).
	    """
	    if not video_path:
	        return None

	    tmp = tempfile.mkdtemp()
	    frame = os.path.join(tmp, "frame.jpg")
	    ass_file = os.path.join(tmp, "preview.ass")
	    preview_img = os.path.join(tmp, "preview.jpg")
	    result = None

	    try:
	        # Extract the first frame.
	        subprocess.run([
	            'ffmpeg', '-y', '-i', video_path, '-vframes', '1', '-q:v', '2', frame
	        ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

	        if not os.path.exists(frame):
	            return None

	        # Fixed sample text for the preview. An ASS event must stay on one
	        # physical line, so wrapped lines are joined with the \N break tag.
	        styled_text = format_subtitle_text("Тут ваши субтитры", size, bold)
	        styled_text = styled_text.replace("\n", "\\N")

	        # An ASS "Style:" line lists bare values in the order given by the
	        # "Format:" line -- "Key=Value" pairs are only valid for ffmpeg's
	        # force_style option and would be misread here.
	        style_values = [
	            "Default",                      # Name
	            str(font).replace(",", " "),    # Fontname (a comma would split the field)
	            int(size),                      # Fontsize
	            color_hex_to_ass(color),        # PrimaryColour
	            "&H000000FF",                   # SecondaryColour
	            "&H00000000",                   # OutlineColour
	            color_hex_to_ass(bg),           # BackColour
	            -1 if bold else 0,              # Bold (-1 means true in ASS)
	            0, 0, 0,                        # Italic, Underline, StrikeOut
	            100, 100,                       # ScaleX, ScaleY
	            0, 0,                           # Spacing, Angle
	            1,                              # BorderStyle
	            1,                              # Outline
	            0,                              # Shadow
	            2,                              # Alignment (bottom centre)
	            10, 10,                         # MarginL, MarginR
	            int(margin),                    # MarginV
	            1,                              # Encoding
	        ]
	        style_line = ",".join(str(v) for v in style_values)

	        # NOTE(review): process() burns an SRT through the subtitles filter,
	        # which may assume a different script resolution than the 1920x1080
	        # declared here, so sizes may not match the final render -- confirm.
	        with open(ass_file, "w", encoding="utf-8") as f:
	            f.write("[Script Info]\n")
	            f.write("ScriptType: v4.00+\n")
	            f.write("PlayResX: 1920\n")
	            f.write("PlayResY: 1080\n\n")
	            f.write("[V4+ Styles]\n")
	            f.write("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n")
	            f.write(f"Style: {style_line}\n\n")
	            f.write("[Events]\n")
	            f.write("Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n")
	            f.write(f"Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,{styled_text}\n")

	        # Burn the subtitle onto the frame; ':' and '\' are special inside
	        # an ffmpeg filter argument, so the path is normalised and escaped.
	        safe_ass = ass_file.replace("\\", "/").replace(":", "\\:")
	        subprocess.run([
	            'ffmpeg', '-y', '-i', frame, '-vf', f"ass='{safe_ass}'", preview_img
	        ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

	        if os.path.exists(preview_img):
	            result = preview_img

	    except Exception as e:
	        print("Preview error:", e)
	        traceback.print_exc()
	    finally:
	        # The directory must outlive the call only when it holds the image
	        # being returned; otherwise remove it so failed previews do not
	        # pile up on disk.
	        if result is None:
	            shutil.rmtree(tmp, ignore_errors=True)

	    return result

	# === Full video processing ===
	def process(video, backend, preset, font, size, color, bg, bold, margin):
	    """Transcribe a video, write an SRT file and burn the subtitles in.

	    Steps: extract 16 kHz mono audio with ffmpeg, transcribe it with the
	    selected backend, write the segments to an SRT file, then re-encode the
	    video with ffmpeg's ``subtitles`` filter using the chosen preset
	    overridden by the manual style settings.

	    Args:
	        video: Path of the input video; a falsy value aborts early.
	        backend: ASR backend name passed to transcribe().
	        preset: Key into ``PRESETS`` providing the base style.
	        font: Font name override.
	        size: Font size override.
	        color: Text colour override (``#`` hex string).
	        bg: Back colour override (``#`` hex string).
	        bold: Truthy for bold text.
	        margin: Vertical margin override.

	    Returns:
	        A 5-tuple ``(status, video_path, srt_path, preview_text, None)``. On
	        failure both paths are ``None``, ``preview_text`` is empty and
	        ``status`` carries the error message.
	    """
	    if not video:
	        return "❌ Нет видео", None, None, "", None

	    tmp = tempfile.mkdtemp()
	    wav = os.path.join(tmp, "audio.wav")
	    succeeded = False

	    try:
	        subprocess.run(['ffmpeg', '-y', '-i', video, '-vn', '-ac', '1', '-ar', '16000', wav],
	                       check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

	        segs = transcribe(wav, backend)
	        if not segs:
	            return "❌ Нет речи", None, None, "", None

	        style = PRESETS[preset].copy()
	        style.update({
	            "FontName": font,
	            "FontSize": int(size),
	            "PrimaryColour": color,
	            "BackColour": bg,
	            "Bold": int(bold),
	            "MarginV": int(margin),
	        })

	        # Format every segment once and reuse it for the preview and the SRT.
	        texts = [format_subtitle_text(s["text"], size, bold) for s in segs]
	        preview_text = "".join(
	            f"{i}. {txt.replace(chr(10), ' / ')}\n"
	            for i, txt in enumerate(texts, 1)
	        )

	        srt = os.path.join(tmp, "subs.srt")
	        with open(srt, "w", encoding="utf-8") as f:
	            for i, (s, txt) in enumerate(zip(segs, texts), 1):
	                f.write(f"{i}\n{format_srt_time(s['start'])} --> {format_srt_time(s['end'])}\n{txt}\n\n")

	        # Write the result into this run's private temp directory: a name
	        # built only from the current second in the working directory
	        # collides when two requests finish in the same second.
	        out = os.path.join(tmp, f"result_{int(time.time())}.mp4")
	        fs = style_to_force(style)
	        # ':' and '\' are special inside an ffmpeg filter argument.
	        safe_srt = srt.replace("\\", "/").replace(":", "\\:")
	        vf = f"subtitles='{safe_srt}':force_style='{fs}'"

	        subprocess.run(['ffmpeg', '-y', '-i', video, '-vf', vf, '-c:a', 'copy', out],
	                       check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

	        succeeded = True
	        return "✅ Готово", out, srt, preview_text, None

	    except Exception as e:
	        # Keep the full traceback in the server log; the UI shows only the message.
	        traceback.print_exc()
	        return f"❌ Ошибка: {str(e)}", None, None, "", None

	    finally:
	        if succeeded:
	            # The returned files live in tmp; only the intermediate audio
	            # is no longer needed.
	            try:
	                os.remove(wav)
	            except OSError:
	                pass
	        else:
	            shutil.rmtree(tmp, ignore_errors=True)

	# === Gradio interface ===
	# Build the Gradio UI: input controls (video, ASR backend, preset, manual
	# style settings, two action buttons) and output widgets (status, text
	# preview, styled-frame preview, rendered video, SRT file).
	# NOTE(review): the nesting of the `with` blocks below cannot be read from
	# this copy of the file (indentation is missing) -- restore it before running.
	with gr.Blocks() as demo:
	gr.Markdown("## 🎬 Автосубтитры (Whisper / FasterWhisper / NeMo) + LIVE preview + Превью стиля")

	with gr.Row():
	with gr.Column():
	# Source video, recognition backend and base style preset.
	video = gr.Video(label="Видео")
	backend = gr.Dropdown(ASR_BACKENDS, value="Whisper", label="ASR")
	preset = gr.Dropdown(list(PRESETS.keys()), value=list(PRESETS.keys())[0], label="Пресет")

	# Manual style settings; process() applies them on top of the preset.
	gr.Markdown("### 🎨 Ручная настройка")
	font = gr.Textbox("Montserrat", label="Шрифт")
	size = gr.Slider(minimum=10, maximum=96, value=32, step=1, label="Размер шрифта")
	color = gr.ColorPicker("#FFFFFF", label="Цвет текста")
	bg = gr.ColorPicker("#80000000", label="Фон")
	bold = gr.Checkbox(True, label="Bold")
	margin = gr.Slider(10, 100, 40, label="Отступ снизу")

	with gr.Row():
	run_btn = gr.Button("🚀 Сгенерировать субтитры")
	preview_btn = gr.Button("👁️ Превью стиля")

	with gr.Column():
	# Output widgets filled by the two button callbacks.
	status = gr.Markdown()
	preview = gr.Textbox(label="LIVE preview текста субтитров", lines=8)
	preview_img = gr.Image(label="Превью стиля на кадре", type="filepath")
	out_video = gr.Video(label="Видео с субтитрами")
	out_srt = gr.File(label="SRT файл")

	# process() returns five values, mapped in order onto `outputs`.
	run_btn.click(
	process,
	inputs=[video, backend, preset, font, size, color, bg, bold, margin],
	outputs=[status, out_video, out_srt, preview, preview_img]
	)

	# The style preview takes the video and manual settings only (no backend
	# or preset) and updates just the preview image.
	preview_btn.click(
	preview_subtitle_style,
	inputs=[video, font, size, color, bg, bold, margin],
	outputs=[preview_img]
	)

	# Enable request queueing and start the app.
	demo.queue().launch()