# Auto-Sub / app.py
# Author: sahar-yaccov — "Update app.py" (commit bbaf394, verified)
import gradio as gr
import cv2
import numpy as np
import subprocess
import os
import torch
import whisper
from deep_translator import GoogleTranslator
from math import floor
import tempfile
# ---------------------------
# Video processing functions
# ---------------------------
def draw_grid(frame, width, height, num_lines=5, line_color=(255, 255, 0), line_thickness=1):
    """Return a copy of `frame` overlaid with a labeled coordinate grid.

    Draws `num_lines - 1` vertical and horizontal lines, annotates each with
    its pixel coordinate, and marks the origin so users can pick watermark
    coordinates visually.
    """
    annotated = frame.copy()
    # Vertical grid lines, x coordinate printed near the bottom edge.
    for idx in range(1, num_lines):
        col = floor(idx * width / num_lines)
        cv2.line(annotated, (col, 0), (col, height), line_color, line_thickness)
        cv2.putText(annotated, str(col), (col + 5, height - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, line_color, 1, cv2.LINE_AA)
    # Horizontal grid lines, y coordinate printed near the left edge.
    for idx in range(1, num_lines):
        row = floor(idx * height / num_lines)
        cv2.line(annotated, (0, row), (width, row), line_color, line_thickness)
        cv2.putText(annotated, str(row), (10, row - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, line_color, 1, cv2.LINE_AA)
    # Mark the origin so the coordinate system orientation is obvious.
    cv2.putText(annotated, '(0,0)', (10, 20),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, line_color, 2, cv2.LINE_AA)
    return annotated
def is_ffmpeg_available():
    """Return True if the `ffmpeg` executable exists and runs, else False."""
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Either ffmpeg exited non-zero or it is not on PATH at all.
        return False
    return True
def remove_watermark_from_frame(frame, mask_coords):
    """Inpaint the rectangular watermark region out of a single frame.

    Parameters:
        frame: BGR image (numpy array) as read by cv2.
        mask_coords: None (frame returned untouched), a flat
            (x1, y1, x2, y2) tuple, or a nested ((x1, y1), (x2, y2)) pair.

    Returns the inpainted frame (the input frame when mask_coords is None).
    """
    if mask_coords is None:
        return frame
    # Accept both the flat and the nested coordinate formats.
    # isinstance check includes np.integer so numpy-typed coords also work.
    if isinstance(mask_coords[0], (int, np.integer)):
        x1, y1, x2, y2 = (int(v) for v in mask_coords)
    else:
        (x1, y1), (x2, y2) = mask_coords
    h, w = frame.shape[:2]
    # Clamp to the frame so out-of-range user input cannot index outside
    # the mask or produce a degenerate region.
    x1, x2 = max(0, min(x1, w)), max(0, min(x2, w))
    y1, y2 = max(0, min(y1, h)), max(0, min(y2, h))
    mask = np.zeros((h, w), dtype=np.uint8)
    mask[y1:y2, x1:x2] = 255
    return cv2.inpaint(frame, mask, 3, cv2.INPAINT_TELEA)
def extract_first_frame(video_file_path):
    """Read and return the first frame of the video, or None if it cannot be read."""
    capture = cv2.VideoCapture(video_file_path)
    try:
        ok, frame = capture.read()
    finally:
        capture.release()
    return frame if ok else None
def frames_to_video(frames, output_path, fps, frame_size):
    """Encode an iterable of frames into an mp4 file at `output_path`.

    Uses the 'mp4v' codec; `frame_size` is (width, height). Always returns True.
    """
    writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, frame_size)
    for single_frame in frames:
        writer.write(single_frame)
    writer.release()
    return True
def remove_watermark_process(video_path, coords_input=None):
    """Re-encode the video with the watermark region inpainted in every frame.

    Parameters:
        video_path: path to the source video.
        coords_input: rectangle passed through to remove_watermark_from_frame,
            or None to copy frames unmodified.

    Returns (output_path, message), or (None, message) when ffmpeg is missing.
    NOTE(review): cv2.VideoWriter output carries no audio track, so the
    source audio is dropped here; the subtitle step later re-muxes from
    this file, so audio loss propagates — confirm whether that is intended.
    """
    if not is_ffmpeg_available():
        return None, "ffmpeg not available"
    temp_output = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Stream frames straight into the writer instead of collecting them all
    # in a list first — keeps memory flat regardless of video length.
    writer = cv2.VideoWriter(temp_output, cv2.VideoWriter_fourcc(*'mp4v'),
                             fps, (width, height))
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            writer.write(remove_watermark_from_frame(frame, coords_input))
    finally:
        # Release both handles even if decoding/inpainting raises mid-stream.
        cap.release()
        writer.release()
    return temp_output, "Watermark removed"
def add_subtitles_process(video_path, video_voice_language='en', goal_transcript='en'):
    """Transcribe the video with Whisper, optionally translate the text, and
    burn the subtitles into a new mp4.

    Parameters:
        video_path: path to the input video.
        video_voice_language: language code of the spoken audio (Whisper hint).
        goal_transcript: language code for the rendered subtitles; when it
            differs from the voice language, lines are translated via
            GoogleTranslator.

    Returns the path of the subtitled mp4 (a temp file).
    Raises subprocess.CalledProcessError if the ffmpeg burn-in step fails.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("small").to(device=device)
    result = model.transcribe(video_path, language=video_voice_language)
    segments = result["segments"]

    # Re-split long segments into chunks of at most 6 words, distributing the
    # original segment's time span evenly across the chunks so short lines
    # stay readable on screen.
    max_words_per_segment = 6
    new_segments = []
    for seg in segments:
        start = seg["start"]
        end = seg["end"]
        text = seg["text"].strip()
        words = text.split()
        duration = end - start
        num_splits = (len(words) + max_words_per_segment - 1) // max_words_per_segment
        split_duration = duration / num_splits if num_splits > 0 else duration
        for i in range(num_splits):
            split_text = " ".join(words[i * max_words_per_segment:(i + 1) * max_words_per_segment])
            new_segments.append({
                "start": start + i * split_duration,
                "end": start + (i + 1) * split_duration,
                "text": split_text,
            })
    segments = new_segments

    translator = GoogleTranslator(source=video_voice_language, target=goal_transcript)
    srt_file = tempfile.NamedTemporaryFile(delete=False, suffix=".srt").name

    def format_timestamp(t):
        # SRT timestamp format: HH:MM:SS,mmm
        hours = int(t // 3600)
        minutes = int((t % 3600) // 60)
        seconds = int(t % 60)
        millis = int((t % 1) * 1000)
        return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

    with open(srt_file, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            text = seg["text"].strip()
            if goal_transcript != video_voice_language:
                # Best-effort translation: keep the untranslated line on
                # failure rather than aborting the whole subtitle pass.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
                try:
                    text = translator.translate(text)
                except Exception:
                    pass
            f.write(f"{i}\n{format_timestamp(seg['start'])} --> {format_timestamp(seg['end'])}\n{text}\n\n")

    # Burn the SRT into the video; audio stream is copied untouched.
    final_output = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
    font_name = 'Arial'
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vf", f"subtitles={srt_file}:force_style='FontName={font_name},FontSize=20,PrimaryColour=&HFFFFFF&,BackColour=&H000000&,BorderStyle=3,Outline=1,Shadow=0'",
        "-c:a", "copy",
        final_output
    ]
    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return final_output
# ---------------------------
# Gradio interface
# ---------------------------
def process_video(video_file_path, wm_coords=None, video_lang='en', subs_lang='en'):
    """Full pipeline: optional watermark removal, then burned-in subtitles.

    Parameters:
        video_file_path: path to the uploaded video.
        wm_coords: "x1,y1,x2,y2" string or empty/None to skip removal.
        video_lang: language code of the spoken audio.
        subs_lang: language code for the rendered subtitles.

    Returns the path of the final subtitled video.
    """
    # Watermark removal: parse "x1,y1,x2,y2" into an int tuple when given.
    wm_coords_tuple = tuple(map(int, wm_coords.split(','))) if wm_coords else None
    no_wm_path, wm_msg = remove_watermark_process(video_file_path, coords_input=wm_coords_tuple)
    # If removal failed (e.g. ffmpeg missing), fall back to the original
    # video instead of handing None to the subtitle step and crashing.
    if no_wm_path is None:
        no_wm_path = video_file_path
    # Subtitle generation
    return add_subtitles_process(no_wm_path, video_voice_language=video_lang, goal_transcript=subs_lang)
# Build the Gradio GUI.
lang_options = {'注讘专讬转':'iw','讗谞讙诇讬转':'en','讛讬谞讚讬':'hi','住驻专讚讬转':'es','爪专驻转讬转':'fr','讙专诪谞讬转':'de','注专讘讬转':'ar'}

def _process_video_ui(video_file_path, wm_coords, video_lang_name, subs_lang_name):
    """Adapter between the UI and process_video.

    Bug fix: the Dropdowns return their display labels, but Whisper and
    GoogleTranslator need language codes ('en', 'iw', ...) — map the labels
    through lang_options before calling the pipeline.
    """
    return process_video(
        video_file_path,
        wm_coords=wm_coords,
        video_lang=lang_options.get(video_lang_name, video_lang_name),
        subs_lang=lang_options.get(subs_lang_name, subs_lang_name),
    )

gr.Interface(
    _process_video_ui,
    inputs=[
        gr.File(label="讘讞专 拽讜讘抓 讜讬讚讗讜"),
        gr.Textbox(label="拽讜讗讜专讚讬谞讟讜转 诇讛住专转 住讬诪谉 诪讬诐 (x1,y1,x2,y2)", placeholder="诇诪砖诇: 0,0,200,50"),
        gr.Dropdown(list(lang_options.keys()), value='讗谞讙诇讬转', label="砖驻转 讗讜讚讬讜 诪拽讜专讬转"),
        gr.Dropdown(list(lang_options.keys()), value='注讘专讬转', label="砖驻转 讻转讜讘讬讜转")
    ],
    outputs=gr.Video(label="讜讬讚讗讜 住讜驻讬 注诐 讻转讜讘讬讜转"),
    title="馃幀 讻诇讬 注讬讘讜讚 讜讬讚讗讜 - 讛住专转 住讬诪谉 诪讬诐 讜讛讜住驻转 讻转讜讘讬讜转",
    description="讛注诇讛 讜讬讚讗讜, 讘讞专 讗讝讜专 住讬诪谉 诪讬诐, 讘爪注 转诪诇讜诇 讜讛讜住驻转 讻转讜讘讬讜转."
).launch()