# Yta-Tools-V2 / caption.py
# rx66y's picture
# Upload 10 files
# 031627f verified
import os
import time
import subprocess
import numpy as np
import cv2
import requests
import gradio as gr
from PIL import Image
# onnxruntime is optional. When it cannot be imported the flag below stays
# False and _load_yolo() reports "no detector" by returning None.
try:
    import onnxruntime as ort
except Exception:
    _ort_available = False
else:
    _ort_available = True

# Download location and local cache path of the ONNX text-detector weights.
YOLO_MODEL_URL = "https://github.com/hjunior29/video-text-remover/raw/main/models/text_detector/best.onnx"
YOLO_MODEL_PATH = "./models/text_detector/best.onnx"

# Lazily created inference session, shared across calls to _load_yolo().
_yolo_session = None
def _load_yolo():
    """Return the shared ONNX Runtime session for the text detector.

    The session is created on first use and cached in the module-level
    ``_yolo_session``. If the model file is not present at
    ``YOLO_MODEL_PATH`` it is downloaded from ``YOLO_MODEL_URL`` first.

    Returns:
        The cached ``ort.InferenceSession``, or ``None`` when onnxruntime
        could not be imported.

    Raises:
        Any exception raised by the download, by writing the model file, or
        by onnxruntime while creating the session propagates to the caller.
    """
    global _yolo_session
    if _yolo_session is not None:
        return _yolo_session
    if not _ort_available:
        return None

    os.makedirs(os.path.dirname(YOLO_MODEL_PATH), exist_ok=True)
    if not os.path.exists(YOLO_MODEL_PATH):
        print("Downloading YOLO11 text detector model...")
        # Stream into a side file and rename only once the download is
        # complete; otherwise an interrupted download would leave a truncated
        # model at the final path that every later call would try to load.
        partial_path = YOLO_MODEL_PATH + ".part"
        try:
            with requests.get(YOLO_MODEL_URL, timeout=120, stream=True) as r:
                r.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, YOLO_MODEL_PATH)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    _yolo_session = ort.InferenceSession(YOLO_MODEL_PATH, providers=providers)
    return _yolo_session
def _yolo_preprocess(frame_bgr, input_size=640):
    """Letterbox a BGR frame into a square float32 tensor for the detector.

    The frame is resized so its longer side equals ``input_size`` (aspect
    ratio preserved), placed in the top-left corner of a zero-filled
    ``input_size`` x ``input_size`` canvas, converted to RGB, scaled to
    [0, 1] and rearranged to shape ``(1, 3, input_size, input_size)``.

    Args:
        frame_bgr: Image array whose first two dimensions are height and
            width; assumed to be 3-channel uint8 BGR — TODO confirm callers.
        input_size: Side length of the square network input.

    Returns:
        ``(tensor, scale, nh, nw)`` where ``scale`` is the resize factor and
        ``nh``/``nw`` are the height/width of the resized image inside the
        padded canvas.

    Raises:
        ValueError: if the frame has zero height or width.
    """
    h, w = frame_bgr.shape[:2]
    if h <= 0 or w <= 0:
        raise ValueError(f"Cannot preprocess an empty frame ({w}x{h}).")
    scale = input_size / max(h, w)
    # int() truncates, so the short side of a very elongated frame could
    # become 0 and make cv2.resize raise; keep at least one pixel.
    nh = max(1, int(h * scale))
    nw = max(1, int(w * scale))
    resized = cv2.resize(frame_bgr, (nw, nh))
    padded = np.zeros((input_size, input_size, 3), dtype=np.uint8)
    padded[:nh, :nw] = resized
    rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    # HWC -> CHW, then add the leading batch axis.
    tensor = np.transpose(rgb, (2, 0, 1))[np.newaxis]
    return tensor, scale, nh, nw
def _yolo_postprocess(outputs, scale, orig_h, orig_w, nw, nh, conf_thresh=0.25, margin=5):
    """Turn raw detector output into a binary mask at the original frame size.

    Args:
        outputs: Sequence of model outputs; only ``outputs[0]`` is read.
        scale: Resize factor used during preprocessing; box coordinates are
            divided by it to map them back to the original frame.
        orig_h: Height of the output mask.
        orig_w: Width of the output mask.
        nw: Accepted for signature compatibility; not used.
        nh: Accepted for signature compatibility; not used.
        conf_thresh: Rows whose best score is below this value are skipped.
        margin: Pixels added on every side of each box before drawing.

    Returns:
        A ``(orig_h, orig_w)`` uint8 array: 255 inside detected boxes, 0
        elsewhere.
    """
    raw = outputs[0]
    # If the second axis is the shorter one, swap it with the last so that
    # iterating yields one detection row at a time.
    if raw.ndim == 3 and raw.shape[1] < raw.shape[2]:
        raw = raw.transpose(0, 2, 1)
    detections = raw[0]

    text_mask = np.zeros((orig_h, orig_w), dtype=np.uint8)
    for row in detections:
        # A usable row holds four box values plus at least one score.
        if row.shape[0] < 5:
            continue
        best_score = float(row[4:].max())
        if best_score < conf_thresh:
            continue
        # Box is read as centre x/y plus width/height.
        cx, cy, bw, bh = row[:4]
        left = max(0, int((cx - bw/2) / scale) - margin)
        top = max(0, int((cy - bh/2) / scale) - margin)
        right = min(orig_w, int((cx + bw/2) / scale) + margin)
        bottom = min(orig_h, int((cy + bh/2) / scale) + margin)
        cv2.rectangle(text_mask, (left, top), (right, bottom), 255, -1)
    return text_mask
def detect_text_mask_yolo(frame_bgr):
    """Run the YOLO text detector on one frame and return its text mask.

    Args:
        frame_bgr: Frame array passed to ``_yolo_preprocess``.

    Returns:
        The uint8 mask produced by ``_yolo_postprocess`` (same height and
        width as the frame), or ``None`` when no session is available or any
        step raises. Errors are printed, never propagated.
    """
    try:
        session = _load_yolo()
        if session is None:
            return None
        blob, ratio, new_h, new_w = _yolo_preprocess(frame_bgr)
        feed = {session.get_inputs()[0].name: blob}
        raw_outputs = session.run(None, feed)
        frame_h, frame_w = frame_bgr.shape[:2]
        return _yolo_postprocess(raw_outputs, ratio, frame_h, frame_w, new_w, new_h)
    except Exception as e:
        # Best effort: callers treat None as "detector unavailable".
        print(f"YOLO detect error: {e}")
        return None
def detect_caption_bbox_fallback(frame_bgr):
    """Heuristically locate one caption-like box without the neural detector.

    Two horizontal bands are searched: the bottom 35% of the frame and the
    top 12%. Each band is Otsu-thresholded, closed with a wide flat kernel so
    neighbouring blobs merge, and the largest contour bounding box that is at
    least 20% of the frame width, at least 8 px tall and at most 25% of the
    frame height wins.

    Args:
        frame_bgr: BGR frame array.

    Returns:
        ``(x1, y1, x2, y2)`` in frame coordinates, or ``None`` when no
        candidate passes the size filters.
    """
    h, w = frame_bgr.shape[:2]
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    pad = 6
    bands = ((int(h*0.65), h), (0, int(h*0.12)))

    best_box = None
    best_area = 0
    for band_top, band_bottom in bands:
        if band_bottom <= band_top:
            continue
        band = gray[band_top:band_bottom, :]
        _, binary = cv2.threshold(band, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        wide_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (w//8, 3))
        merged = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, wide_kernel)
        contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for contour in contours:
            bx, by, bw, bh = cv2.boundingRect(contour)
            plausible = bw >= w*0.2 and bh >= 8 and bh <= h*0.25
            if not plausible:
                continue
            box_area = bw * bh
            if box_area <= best_area:
                continue
            best_area = box_area
            # NOTE(review): right/bottom edges grow by 2*pad while left/top
            # grow by pad — confirm the asymmetry is intended.
            best_box = (
                max(0, bx-pad),
                max(0, band_top+by-pad),
                min(w, bx+bw+pad*2),
                min(h, band_top+by+bh+pad*2),
            )
    return best_box
def get_video_dims(video_path):
    """Return ``(width, height)`` of a video as reported by OpenCV.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        Tuple of two ints: the frame width and frame height properties.
    """
    capture = cv2.VideoCapture(video_path)
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    capture.release()
    return (width, height)
def extract_frame_at(video_path, timestamp_sec):
    """Grab the frame nearest to ``timestamp_sec`` as a PIL image.

    The timestamp is clamped to ``[0, duration - 0.1]`` when the container
    reports a frame count. If the seek/read fails, the first frame of the
    video is returned instead and the status message reports 0.0s.

    Args:
        video_path: Path of the video; a falsy value short-circuits.
        timestamp_sec: Requested position in seconds (anything ``float()``
            accepts).

    Returns:
        ``(image, message)``. ``image`` is a ``PIL.Image`` in RGB, or
        ``None`` on failure; ``message`` is a user-facing status string.
        Exceptions are converted into an ``"Error: ..."`` message.
    """
    if not video_path:
        return None, "Upload a video first."
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        ts = max(0.0, float(timestamp_sec))
        # Only clamp against the duration when it is actually known; some
        # containers report no frame count, and clamping against 0 would pin
        # every request to the first frame.
        if total_frames > 0:
            ts = max(0.0, min(ts, total_frames / fps - 0.1))
        cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
        ret, frame = cap.read()
        if not ret:
            # Seek failed or ran past the end: fall back to the first frame
            # and report the position that was really used.
            cap.release()
            cap = cv2.VideoCapture(video_path)
            ret, frame = cap.read()
            ts = 0.0
        if not ret:
            return None, "Could not read frame."
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)), \
            f"Frame at {ts:.1f}s loaded. Paint over captions then click REMOVE CAPTIONS."
    except Exception as e:
        return None, f"Error: {e}"
    finally:
        # Release the capture on every path, including exceptions.
        if cap is not None:
            cap.release()
def run_opencv_inpaint(abs_input, output_path, mask_img, method='hybrid'):
    """Remove the masked region from every frame and write an H.264 MP4.

    Frames are processed with OpenCV into a temporary ``mp4v`` file, which
    ffmpeg then re-encodes with libx264 while muxing in the first audio
    stream of the original input (if it has one), re-encoded to AAC.

    Args:
        abs_input: Path of the source video.
        output_path: Path of the final MP4; ``output_path + "_tmp.mp4"`` is
            used as scratch space and removed afterwards.
        mask_img: Single-channel uint8 mask; pixels equal to 255 are
            removed. It is resized to the video dimensions.
        method: One of
            ``'hybrid'`` – dilate the mask (20x20) then Telea inpaint, radius 3;
            ``'ns'``     – Navier-Stokes inpaint, radius 3;
            ``'blur'``   – replace masked pixels with a Gaussian-blurred copy;
            ``'black'``  – fill masked pixels with 0;
            ``'bg'``     – fill with the mean colour of a ring around the mask;
            anything else (e.g. ``'telea'``) – Telea inpaint, radius 2.

    Returns:
        ``(True, stats)`` on success, where ``stats`` summarises frame count
        and speed, or ``(False, message)`` on any failure. Never raises.
    """
    temp_vid = output_path + "_tmp.mp4"
    cap = None
    out = None
    try:
        vid_w, vid_h = get_video_dims(abs_input)
        cap = cv2.VideoCapture(abs_input)
        if not cap.isOpened():
            return False, f"Could not open video: {abs_input}"
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(temp_vid, fourcc, fps, (vid_w, vid_h))
        mask = cv2.resize(mask_img, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)

        # Everything derived from the mask alone is identical for every
        # frame, so compute it once instead of inside the frame loop.
        selected = mask == 255
        expanded = None
        border = None
        if method == 'hybrid':
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
            expanded = cv2.dilate(mask, kernel, iterations=1)
        elif method == 'bg':
            kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))
            # Ring of pixels just outside the mask, used to sample a colour.
            border = cv2.dilate(mask, kernel2) - mask

        frame_idx = 0
        t_start = time.time()
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if method == 'hybrid':
                inpainted = cv2.inpaint(frame, expanded, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
            elif method == 'ns':
                inpainted = cv2.inpaint(frame, mask, inpaintRadius=3, flags=cv2.INPAINT_NS)
            elif method == 'blur':
                inpainted = frame.copy()
                blurred = cv2.GaussianBlur(frame, (51, 51), 30)
                inpainted[selected] = blurred[selected]
            elif method == 'black':
                inpainted = frame.copy()
                inpainted[selected] = 0
            elif method == 'bg':
                inpainted = frame.copy()
                mean_color = cv2.mean(frame, mask=border)[:3]
                inpainted[selected] = [int(mean_color[0]), int(mean_color[1]), int(mean_color[2])]
            else:
                inpainted = cv2.inpaint(frame, mask, inpaintRadius=2, flags=cv2.INPAINT_TELEA)
            out.write(inpainted)
            frame_idx += 1
        elapsed = time.time() - t_start
        fps_rate = frame_idx / max(elapsed, 0.1)

        # The writer must be closed before ffmpeg reads the temp file.
        cap.release()
        cap = None
        out.release()
        out = None
        if frame_idx == 0:
            return False, "No frames could be read from the input video."

        # Take audio straight from the original input. The trailing '?'
        # makes the mapping optional, so videos without audio still encode,
        # and re-encoding to AAC works whatever the source audio codec is.
        subprocess.run(["ffmpeg", "-y", "-i", temp_vid, "-i", abs_input,
                        "-map", "0:v:0", "-map", "1:a:0?",
                        "-c:v", "libx264", "-preset", "fast", "-crf", "18",
                        "-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart",
                        output_path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True, f"{frame_idx} frames in {elapsed:.1f}s ({fps_rate:.1f} fps)"
    except subprocess.CalledProcessError as e:
        # Surface the tail of ffmpeg's own diagnostics; the bare traceback
        # would only show the command line.
        detail = e.stderr.decode("utf-8", errors="replace")[-2000:] if e.stderr else ""
        return False, f"ffmpeg failed (exit code {e.returncode}):\n{detail}"
    except Exception:
        import traceback
        return False, traceback.format_exc()
    finally:
        # Runs on success and failure alike so nothing is leaked.
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
        try:
            if os.path.exists(temp_vid):
                os.remove(temp_vid)
        except OSError:
            pass
def auto_preview(video_path, timestamp_sec=3.0):
    """Show which pixels automatic detection would remove at a timestamp.

    The frame at ``timestamp_sec`` is run through the YOLO detector; if that
    yields nothing, the OpenCV band heuristic is tried. Detected pixels are
    tinted red at 50% opacity.

    Args:
        video_path: Path of the video; a falsy value short-circuits.
        timestamp_sec: Position in seconds (negative values become 0).

    Returns:
        ``(image, message)``: a PIL preview image (the plain frame when
        nothing was detected, ``None`` on failure) and a status string.
    """
    if not video_path:
        return None, "Upload a video first."
    try:
        seek_sec = max(0.0, float(timestamp_sec))
        capture = cv2.VideoCapture(video_path)
        capture.set(cv2.CAP_PROP_POS_MSEC, seek_sec * 1000)
        got_frame, frame = capture.read()
        capture.release()
        if not got_frame:
            return None, "Could not read frame."

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_h, frame_w = frame.shape[:2]

        mask = detect_text_mask_yolo(frame)
        used = "YOLO11"
        detector_empty = mask is None or mask.max() == 0
        if detector_empty:
            # Neural detector unavailable or found nothing: use the heuristic.
            box = detect_caption_bbox_fallback(frame)
            used = "OpenCV fallback"
            if box is None:
                return Image.fromarray(rgb), "No captions detected. Try Manual mode or adjust timestamp."
            left, top, right, bottom = box
            mask = np.zeros((frame_h, frame_w), dtype=np.uint8)
            cv2.rectangle(mask, (left, top), (right, bottom), 255, -1)

        # Paint the masked pixels solid red, then mix half-and-half with the
        # untouched frame.
        tinted = rgb.copy()
        tinted[mask == 255] = [220, 40, 40]
        blended = cv2.addWeighted(rgb.copy(), 0.5, tinted, 0.5, 0)
        n_pixels = int(mask.sum() / 255)
        return Image.fromarray(blended), f"{n_pixels} text pixels found via {used}. Red = will be removed."
    except Exception as e:
        return None, f"Error: {e}"
def auto_remove(video_path, timestamp_sec=3.0, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Detect captions on one sampled frame and remove them from the video.

    The mask found at ``timestamp_sec`` (YOLO first, OpenCV heuristic as a
    fallback) is applied unchanged to every frame by ``run_opencv_inpaint``.

    Args:
        video_path: Path of the video; a falsy value short-circuits.
        timestamp_sec: Position in seconds of the frame used for detection.
        inpaint_method: Method name forwarded to ``run_opencv_inpaint``.
        save_folder: Directory for the result; created if missing.

    Returns:
        ``(output_path, message)`` on success, ``(None, message)`` otherwise.
    """
    if not video_path:
        return None, "No video uploaded."

    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    stem = os.path.splitext(os.path.basename(video_path))[0]
    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))

    try:
        seek_ms = max(0.0, float(timestamp_sec)) * 1000
        capture = cv2.VideoCapture(abs_input)
        capture.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
        got_frame, frame = capture.read()
        capture.release()
        if not got_frame:
            return None, "Could not read frame."

        vid_w, vid_h = get_video_dims(abs_input)
        mask = detect_text_mask_yolo(frame)
        used = "YOLO11"
        if mask is not None and mask.max() != 0:
            # Detector mask is frame-sized; align it with the reported
            # video dimensions.
            mask = cv2.resize(mask, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)
        else:
            box = detect_caption_bbox_fallback(frame)
            used = "OpenCV fallback"
            if box is None:
                return None, "No captions detected. Try Manual mode."
            left, top, right, bottom = box
            mask = np.zeros((vid_h, vid_w), dtype=np.uint8)
            cv2.rectangle(mask, (left, top), (right, bottom), 255, -1)

        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ {used} + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        import traceback
        return None, f"Error:\n{traceback.format_exc()}"
def manual_extract_frame(video_path, timestamp_sec):
    """Fetch the frame shown in the manual paint editor.

    Delegates to ``extract_frame_at`` and returns its ``(image, message)``
    result unchanged.
    """
    image_and_status = extract_frame_at(video_path, timestamp_sec)
    return image_and_status
def manual_remove(video_path, editor_value, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Remove the region the user painted in the image editor from the video.

    The mask is built from the alpha channel of the editor's layers; if that
    produces nothing, the difference between the editor's composite and
    background images is used instead.

    Args:
        video_path: Path of the video; a falsy value short-circuits.
        editor_value: Mapping read via ``.get`` for the keys ``"layers"``,
            ``"composite"`` and ``"background"``; the images must support
            ``.convert`` (presumably PIL images from the editor — verify
            against the component configuration).
        inpaint_method: Method name forwarded to ``run_opencv_inpaint``.
        save_folder: Directory for the result; created if missing.

    Returns:
        ``(output_path, message)`` on success, ``(None, message)`` otherwise.
    """
    if not video_path:
        return None, "No video uploaded."
    if editor_value is None:
        return None, "Extract frame and paint over captions first."

    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    stem = os.path.splitext(os.path.basename(video_path))[0]
    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))

    try:
        vid_w, vid_h = get_video_dims(abs_input)
        target_size = (vid_w, vid_h)
        layers = editor_value.get("layers", [])
        composite = editor_value.get("composite")
        background = editor_value.get("background")

        mask = np.zeros((vid_h, vid_w), dtype=np.uint8)

        # First choice: any layer pixel with alpha above 10 counts as paint.
        for layer in (layers or []):
            if layer is None:
                continue
            rgba = cv2.resize(np.array(layer.convert("RGBA")), target_size,
                              interpolation=cv2.INTER_NEAREST)
            alpha = rgba[:, :, 3]
            mask[alpha > 10] = 255

        # Second choice: pixels where the composite differs from the
        # background by more than 15 grey levels.
        layers_empty = mask.max() == 0
        if layers_empty and composite is not None and background is not None:
            painted = cv2.resize(np.array(composite.convert("RGB")), target_size)
            clean = cv2.resize(np.array(background.convert("RGB")), target_size)
            delta = cv2.cvtColor(cv2.absdiff(painted, clean), cv2.COLOR_RGB2GRAY)
            mask[delta > 15] = 255

        if mask.max() == 0:
            return None, "No strokes detected. Paint over the captions."

        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ Manual + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        import traceback
        return None, f"Error:\n{traceback.format_exc()}"
def build_tab():
    """Declare the "CAPTION REMOVER" tab and connect its event handlers.

    Layout: an upload/preview row and a timestamp slider shared by two
    sub-tabs, "AUTO DETECT" (preview + remove via ``auto_preview`` /
    ``auto_remove``) and "MANUAL PAINT" (frame extraction + remove via
    ``manual_extract_frame`` / ``manual_remove``).

    Presumably called inside an enclosing Gradio layout context — the caller
    is not visible here.
    """
    video_types = [".mp4",".mov",".mkv",".avi",".webm"]
    inpaint_choices = ("hybrid","telea","ns","blur","black","bg")

    with gr.Tab("🖌️ CAPTION REMOVER"):
        gr.HTML("""<div style="padding:16px 4px 6px;">
<div style="font-family:'Orbitron',sans-serif;font-size:.65rem;font-weight:700;color:#00d4ff;letter-spacing:.2em;text-transform:uppercase;margin-bottom:6px;">YOLO11 + OpenCV Inpainting</div>
<div style="font-family:'Share Tech Mono',monospace;font-size:.75rem;color:#2a5570;line-height:1.9;">
Upload video, scrub to where captions appear, use Auto or Manual.<br>
<span style="color:#ff3c6e;">Complex backgrounds may still show minor artifacts on CPU.</span>
</div></div>""")

        # Shared inputs: uploaded file, read-only player, sample timestamp.
        with gr.Row():
            with gr.Column(scale=1):
                upload = gr.File(label="Upload Video", file_types=video_types)
            with gr.Column(scale=2):
                player = gr.Video(label="Preview - scrub to find captions", interactive=False, height=300)
        # Mirror the uploaded file into the player.
        upload.change(fn=lambda f: f, inputs=[upload], outputs=[player])
        timestamp = gr.Slider(label="Timestamp to sample caption detection (seconds)", minimum=0, maximum=300, value=3, step=0.5)
        gr.HTML("<div style='height:1px;background:#0d2137;margin:8px 0;'></div>")

        with gr.Tabs():
            with gr.Tab("🤖 AUTO DETECT"):
                with gr.Row(equal_height=True):
                    with gr.Column(scale=1):
                        auto_method = gr.Dropdown(label="Inpaint Method", choices=list(inpaint_choices), value="hybrid")
                        preview_btn = gr.Button("PREVIEW DETECTION", variant="secondary", size="lg")
                        auto_go_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        auto_log = gr.Textbox(label="System Log", interactive=False, lines=4)
                        auto_out = gr.File(label="Download Result")
                    with gr.Column(scale=1):
                        preview_img = gr.Image(label="Detection Preview - red = pixels to be removed", type="pil", interactive=False, height=360)
                preview_btn.click(fn=auto_preview, inputs=[upload, timestamp], outputs=[preview_img, auto_log])
                auto_go_btn.click(fn=auto_remove, inputs=[upload, timestamp, auto_method], outputs=[auto_out, auto_log])

            with gr.Tab("✏️ MANUAL PAINT"):
                with gr.Row(equal_height=False):
                    with gr.Column(scale=1):
                        manual_method = gr.Dropdown(label="Inpaint Method", choices=list(inpaint_choices), value="hybrid")
                        extract_btn = gr.Button("EXTRACT FRAME", variant="secondary", size="lg")
                        manual_go_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        manual_log = gr.Textbox(label="System Log", interactive=False, lines=4)
                        manual_out = gr.File(label="Download Result")
                    with gr.Column(scale=2):
                        editor = gr.ImageEditor(
                            label="Paint over the caption text",
                            type="pil",
                            height=420,
                            brush=gr.Brush(colors=["#ff0000","#ffffff"], color_mode="fixed", default_size=10),
                            eraser=gr.Eraser(default_size=24),
                            layers=False,
                            interactive=True,
                        )
                extract_btn.click(fn=manual_extract_frame, inputs=[upload, timestamp], outputs=[editor, manual_log])
                manual_go_btn.click(fn=manual_remove, inputs=[upload, editor, manual_method], outputs=[manual_out, manual_log])