"""Gradio tab that removes burned-in captions from a video.

Text regions are located on one sampled frame, either with an ONNX text
detector (with an OpenCV heuristic as fallback) or from strokes painted by
the user. The resulting mask is then filled on every frame with OpenCV and
the result is re-encoded with ffmpeg.
"""

import os
import subprocess
import time
import traceback

import cv2
import gradio as gr
import numpy as np
import requests
from PIL import Image

try:
    import onnxruntime as ort
    _ort_available = True
except Exception:
    _ort_available = False

YOLO_MODEL_URL = "https://github.com/hjunior29/video-text-remover/raw/main/models/text_detector/best.onnx"
YOLO_MODEL_PATH = "./models/text_detector/best.onnx"

# Lazily created inference session shared by every detection call.
_yolo_session = None


def _load_yolo():
    """Return the cached ONNX Runtime session, creating it on first use.

    The model file is downloaded to ``YOLO_MODEL_PATH`` when it is missing.

    Returns:
        The shared inference session, or ``None`` when onnxruntime could not
        be imported.

    Raises:
        requests.RequestException: if the model download fails.
    """
    global _yolo_session
    if _yolo_session is not None:
        return _yolo_session
    if not _ort_available:
        return None

    os.makedirs(os.path.dirname(YOLO_MODEL_PATH), exist_ok=True)
    if not os.path.exists(YOLO_MODEL_PATH):
        print("Downloading YOLO11 text detector model...")
        # Download next to the target and rename only on success, so an
        # interrupted transfer never leaves a truncated model that later
        # calls would try to load.
        partial_path = YOLO_MODEL_PATH + ".part"
        try:
            with requests.get(YOLO_MODEL_URL, timeout=120, stream=True) as r:
                r.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, YOLO_MODEL_PATH)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Only request providers this onnxruntime build actually offers.
    preferred = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    available = set(ort.get_available_providers())
    providers = [p for p in preferred if p in available] or ["CPUExecutionProvider"]
    _yolo_session = ort.InferenceSession(YOLO_MODEL_PATH, providers=providers)
    return _yolo_session


def _yolo_preprocess(frame_bgr, input_size=640):
    """Resize a BGR frame into a square, zero-padded network input.

    The frame is scaled so its longer side equals ``input_size``, placed in
    the top-left corner of a black square, converted to RGB, scaled to
    [0, 1] and laid out as a float32 array of shape (1, 3, size, size).

    Returns:
        ``(tensor, scale, nh, nw)`` where ``scale`` is the resize factor and
        ``nh``/``nw`` are the resized height and width before padding.
    """
    h, w = frame_bgr.shape[:2]
    scale = input_size / max(h, w)
    # Clamp to one pixel: an extreme aspect ratio would otherwise truncate
    # the short side to 0 and make cv2.resize fail.
    nh = max(1, int(h * scale))
    nw = max(1, int(w * scale))
    resized = cv2.resize(frame_bgr, (nw, nh))
    padded = np.zeros((input_size, input_size, 3), dtype=np.uint8)
    padded[:nh, :nw] = resized
    rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    tensor = np.transpose(rgb, (2, 0, 1))[np.newaxis]
    return tensor, scale, nh, nw


def _yolo_postprocess(outputs, scale, orig_h, orig_w, nw, nh, conf_thresh=0.25, margin=5):
    """Turn raw detector output into a binary mask at the frame's size.

    Each prediction row is read as ``(cx, cy, w, h, score, ...)`` with the
    box in network-input pixels; the confidence is the largest value from
    column 4 onward.

    Args:
        outputs: Sequence of arrays returned by the session; only the first
            one is used.
        scale: Resize factor returned by ``_yolo_preprocess``.
        orig_h: Height of the original frame.
        orig_w: Width of the original frame.
        nw: Unused; kept for interface compatibility.
        nh: Unused; kept for interface compatibility.
        conf_thresh: Minimum confidence for a box to be drawn.
        margin: Extra pixels added on every side of each box.

    Returns:
        A uint8 array of shape ``(orig_h, orig_w)``, 255 inside boxes.
    """
    preds = outputs[0]
    # Bring the output to (batch, predictions, attributes) when the
    # attribute axis comes first.
    # NOTE(review): assumes there are fewer attributes than predictions.
    if preds.ndim == 3 and preds.shape[1] < preds.shape[2]:
        preds = preds.transpose(0, 2, 1)
    preds = preds[0]

    mask = np.zeros((orig_h, orig_w), dtype=np.uint8)
    if preds.ndim != 2 or preds.shape[1] < 5:
        return mask

    # Filter by confidence in one vectorized pass so that Python only loops
    # over the boxes that are actually drawn.
    conf = preds[:, 4:].max(axis=1)
    for cx, cy, bw, bh in preds[conf >= conf_thresh, :4]:
        x1 = max(0, int((cx - bw / 2) / scale) - margin)
        y1 = max(0, int((cy - bh / 2) / scale) - margin)
        x2 = min(orig_w, int((cx + bw / 2) / scale) + margin)
        y2 = min(orig_h, int((cy + bh / 2) / scale) + margin)
        cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
    return mask


def detect_text_mask_yolo(frame_bgr):
    """Run the ONNX text detector on one BGR frame.

    Returns:
        A uint8 mask with the frame's height and width, or ``None`` when the
        detector is unavailable or any step fails (the error is printed).
    """
    try:
        sess = _load_yolo()
        if sess is None:
            return None
        tensor, scale, nh, nw = _yolo_preprocess(frame_bgr)
        input_name = sess.get_inputs()[0].name
        outputs = sess.run(None, {input_name: tensor})
        h, w = frame_bgr.shape[:2]
        return _yolo_postprocess(outputs, scale, h, w, nw, nh)
    except Exception as e:
        print(f"YOLO detect error: {e}")
        return None


def detect_caption_bbox_fallback(frame_bgr):
    """Guess a caption box using thresholding and morphology.

    Only the bottom 35% and the top 12% of the frame are searched. In each
    band the image is Otsu-thresholded and closed with a wide, flat kernel;
    the largest contour that is at least 20% of the frame width, at least
    8 px tall and at most 25% of the frame height wins.

    Returns:
        ``(x1, y1, x2, y2)`` in frame coordinates, or ``None``.
    """
    h, w = frame_bgr.shape[:2]
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    # Kernel width must be at least 1 even for frames narrower than 8 px.
    kern = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 8), 3))
    pad = 6

    best_box, best_area = None, 0
    for y_start, y_end in [(int(h * 0.65), h), (0, int(h * 0.12))]:
        if y_end <= y_start:
            continue
        region = gray[y_start:y_end, :]
        _, thresh = cv2.threshold(region, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kern)
        contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for cnt in contours:
            cx, cy, cw, ch2 = cv2.boundingRect(cnt)
            if cw < w * 0.2 or ch2 < 8 or ch2 > h * 0.25:
                continue
            area = cw * ch2
            if area > best_area:
                best_area = area
                # The right and bottom edges get twice the padding of the
                # left and top edges.
                best_box = (
                    max(0, cx - pad),
                    max(0, y_start + cy - pad),
                    min(w, cx + cw + pad * 2),
                    min(h, y_start + cy + ch2 + pad * 2),
                )
    return best_box


def get_video_dims(video_path):
    """Return ``(width, height)`` as reported by OpenCV for *video_path*."""
    cap = cv2.VideoCapture(video_path)
    try:
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        cap.release()
    return w, h


def _read_frame_at(video_path, timestamp_sec):
    """Seek to *timestamp_sec* (clamped to >= 0) and decode one BGR frame.

    Returns the frame, or ``None`` when no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        cap.set(cv2.CAP_PROP_POS_MSEC, max(0.0, float(timestamp_sec)) * 1000)
        ret, frame = cap.read()
    finally:
        cap.release()
    return frame if ret else None


def _detect_caption_mask(frame_bgr):
    """Locate caption pixels with the ONNX detector, then the heuristic.

    Returns:
        ``(mask, detector_name)``. ``mask`` is a uint8 array with the
        frame's height and width, or ``None`` when neither method found
        anything.
    """
    mask = detect_text_mask_yolo(frame_bgr)
    if mask is not None and mask.max() > 0:
        return mask, "YOLO11"

    box = detect_caption_bbox_fallback(frame_bgr)
    if box is None:
        return None, "OpenCV fallback"
    h, w = frame_bgr.shape[:2]
    mask = np.zeros((h, w), dtype=np.uint8)
    x1, y1, x2, y2 = box
    cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
    return mask, "OpenCV fallback"


def extract_frame_at(video_path, timestamp_sec):
    """Decode the frame at *timestamp_sec* for the manual paint editor.

    The timestamp is clamped to the video's duration when the frame count is
    known. If the seek yields no frame, the first frame is used instead.

    Returns:
        ``(PIL image or None, status message)``.
    """
    if not video_path:
        return None, "Upload a video first."
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            ts = max(0.0, float(timestamp_sec))
            if total_frames > 0:
                # Stay just short of the end so the seek lands on a frame.
                # When the frame count is unknown, the requested time is
                # kept as is.
                ts = max(0.0, min(ts, total_frames / fps - 0.1))
            cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
            ret, frame = cap.read()
        finally:
            cap.release()

        if not ret:
            # Seeking failed: fall back to the very first frame and report
            # the timestamp that was actually loaded.
            cap2 = cv2.VideoCapture(video_path)
            try:
                ret, frame = cap2.read()
            finally:
                cap2.release()
            ts = 0.0
        if not ret:
            return None, "Could not read frame."
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)), \
            f"Frame at {ts:.1f}s loaded. Paint over captions then click REMOVE CAPTIONS."
    except Exception as e:
        return None, f"Error: {e}"


def run_opencv_inpaint(abs_input, output_path, mask_img, method='hybrid'):
    """Fill the masked region on every frame and write an H.264 MP4.

    Frames are processed with OpenCV into a temporary file, which ffmpeg
    then re-encodes to *output_path* together with the input's first audio
    stream, if it has one.

    Args:
        abs_input: Path of the source video.
        output_path: Path of the final MP4.
        mask_img: Single-channel uint8 mask, 255 where pixels are replaced;
            it is resized to the video's dimensions.
        method: ``'hybrid'`` (Telea on a dilated mask), ``'ns'``
            (Navier-Stokes), ``'blur'``, ``'black'``, ``'bg'`` (mean colour
            of the mask border); anything else uses plain Telea.

    Returns:
        ``(True, timing summary)`` on success, ``(False, error text)``
        otherwise. Exceptions are reported through the return value.
    """
    temp_vid = output_path + "_tmp.mp4"
    cap = None
    out = None
    try:
        vid_w, vid_h = get_video_dims(abs_input)
        if vid_w <= 0 or vid_h <= 0:
            return False, f"Could not open video: {abs_input}"

        mask = cv2.resize(mask_img, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)

        # Everything derived from the mask alone is computed once here
        # instead of once per frame.
        fill = mask == 255
        expanded = None
        border = None
        if method == 'hybrid':
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
            expanded = cv2.dilate(mask, kernel, iterations=1)
        elif method == 'bg':
            kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))
            border = cv2.dilate(mask, kernel2) - mask

        cap = cv2.VideoCapture(abs_input)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(temp_vid, fourcc, fps, (vid_w, vid_h))
        if not out.isOpened():
            return False, f"Could not open video writer for: {temp_vid}"

        frame_idx = 0
        t_start = time.time()
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if method == 'hybrid':
                inpainted = cv2.inpaint(frame, expanded, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
            elif method == 'ns':
                inpainted = cv2.inpaint(frame, mask, inpaintRadius=3, flags=cv2.INPAINT_NS)
            elif method == 'blur':
                inpainted = frame.copy()
                blurred = cv2.GaussianBlur(frame, (51, 51), 30)
                inpainted[fill] = blurred[fill]
            elif method == 'black':
                inpainted = frame.copy()
                inpainted[fill] = 0
            elif method == 'bg':
                inpainted = frame.copy()
                mean_color = cv2.mean(frame, mask=border)[:3]
                inpainted[fill] = [int(mean_color[0]), int(mean_color[1]), int(mean_color[2])]
            else:
                inpainted = cv2.inpaint(frame, mask, inpaintRadius=2, flags=cv2.INPAINT_TELEA)
            out.write(inpainted)
            frame_idx += 1
        elapsed = time.time() - t_start

        # The writer must be finalized before ffmpeg reads the temp file.
        cap.release()
        cap = None
        out.release()
        out = None

        if frame_idx == 0:
            return False, "No frames could be read from the input video."
        fps_rate = frame_idx / max(elapsed, 0.1)

        # Take the audio straight from the original input. The trailing "?"
        # makes the audio mapping optional, so silent inputs still encode,
        # and re-encoding to AAC works whatever codec the source track uses.
        subprocess.run(
            ["ffmpeg", "-y", "-i", temp_vid, "-i", abs_input,
             "-map", "0:v:0", "-map", "1:a:0?",
             "-c:v", "libx264", "-preset", "fast", "-crf", "18",
             "-c:a", "aac", "-b:a", "192k",
             "-movflags", "+faststart", output_path],
            check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True, f"{frame_idx} frames in {elapsed:.1f}s ({fps_rate:.1f} fps)"
    except subprocess.CalledProcessError as e:
        # Surface the tail of ffmpeg's own diagnostics; the Python
        # traceback alone says nothing about why encoding failed.
        stderr_text = e.stderr.decode("utf-8", "replace") if e.stderr else ""
        return False, f"ffmpeg failed (exit code {e.returncode}):\n{stderr_text[-2000:]}"
    except Exception:
        return False, traceback.format_exc()
    finally:
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
        try:
            if os.path.exists(temp_vid):
                os.remove(temp_vid)
        except OSError:
            # Best effort: a leftover temp file must not mask the result.
            pass


def auto_preview(video_path, timestamp_sec=3.0):
    """Show which pixels automatic detection would remove.

    Returns:
        ``(PIL image or None, status message)``. Detected pixels are
        blended 50/50 with red.
    """
    if not video_path:
        return None, "Upload a video first."
    try:
        frame = _read_frame_at(video_path, timestamp_sec)
        if frame is None:
            return None, "Could not read frame."
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        mask, used = _detect_caption_mask(frame)
        if mask is None:
            return Image.fromarray(rgb), "No captions detected. Try Manual mode or adjust timestamp."

        overlay = rgb.copy()
        overlay[mask == 255] = [220, 40, 40]
        blended = cv2.addWeighted(rgb, 0.5, overlay, 0.5, 0)
        n_pixels = int(np.count_nonzero(mask))
        return Image.fromarray(blended), f"{n_pixels} text pixels found via {used}. Red = will be removed."
    except Exception as e:
        return None, f"Error: {e}"


def auto_remove(video_path, timestamp_sec=3.0, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Detect captions on one sampled frame and remove them everywhere.

    The mask found at *timestamp_sec* is applied unchanged to every frame.

    Returns:
        ``(output path or None, status message)``.
    """
    if not video_path:
        return None, "No video uploaded."
    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    output_path = os.path.abspath(os.path.join(save_folder, f"{base_name}_NO_CAPTIONS.mp4"))
    try:
        frame = _read_frame_at(abs_input, timestamp_sec)
        if frame is None:
            return None, "Could not read frame."

        mask, used = _detect_caption_mask(frame)
        if mask is None:
            return None, "No captions detected. Try Manual mode."

        # run_opencv_inpaint resizes the mask to the video's dimensions.
        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ {used} + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        return None, f"Error:\n{traceback.format_exc()}"


def manual_extract_frame(video_path, timestamp_sec):
    """Gradio callback: load a frame into the paint editor."""
    return extract_frame_at(video_path, timestamp_sec)


def manual_remove(video_path, editor_value, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Remove the regions the user painted in the image editor.

    The mask is built from the alpha channel of the editor's layers. If
    that yields nothing, it is built from the difference between the
    editor's composite and background images.

    Args:
        video_path: Path of the uploaded video.
        editor_value: Mapping read through the keys ``"layers"``,
            ``"composite"`` and ``"background"``.
        inpaint_method: Fill method passed to ``run_opencv_inpaint``.
        save_folder: Directory that receives the output file.

    Returns:
        ``(output path or None, status message)``.
    """
    if not video_path:
        return None, "No video uploaded."
    if editor_value is None:
        return None, "Extract frame and paint over captions first."
    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    output_path = os.path.abspath(os.path.join(save_folder, f"{base_name}_NO_CAPTIONS.mp4"))
    try:
        vid_w, vid_h = get_video_dims(abs_input)
        if vid_w <= 0 or vid_h <= 0:
            return None, "Could not read video dimensions."

        layers = editor_value.get("layers") or []
        composite = editor_value.get("composite")
        background = editor_value.get("background")

        mask = np.zeros((vid_h, vid_w), dtype=np.uint8)
        for layer in layers:
            if layer is None:
                continue
            rgba = np.array(layer.convert("RGBA"))
            rgba = cv2.resize(rgba, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)
            # Any pixel that is not almost fully transparent is a stroke.
            mask[rgba[:, :, 3] > 10] = 255

        if mask.max() == 0 and composite is not None and background is not None:
            # No usable layer alpha: treat pixels that differ between the
            # painted composite and the untouched background as strokes.
            cnp = cv2.resize(np.array(composite.convert("RGB")), (vid_w, vid_h))
            bnp = cv2.resize(np.array(background.convert("RGB")), (vid_w, vid_h))
            diff = cv2.cvtColor(cv2.absdiff(cnp, bnp), cv2.COLOR_RGB2GRAY)
            mask[diff > 15] = 255

        if mask.max() == 0:
            return None, "No strokes detected. Paint over the captions."

        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ Manual + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        return None, f"Error:\n{traceback.format_exc()}"


def build_tab():
    """Create the caption-remover tab inside the enclosing Gradio Blocks.

    Must be called within an active ``gr.Blocks`` (or tab container)
    context; the components and event handlers are registered as a side
    effect and nothing is returned.
    """
    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from a whitespace-collapsed source - confirm it matches
    # the intended UI.
    with gr.Tab("🖌️ CAPTION REMOVER"):
        gr.HTML(
            "\n"
            "YOLO11 + OpenCV Inpainting\n"
            "Upload video, scrub to where captions appear, use Auto or Manual.\n"
            "Complex backgrounds may still show minor artifacts on CPU.\n"
        )
        with gr.Row():
            with gr.Column(scale=1):
                cap_video_upload = gr.File(
                    label="Upload Video",
                    file_types=[".mp4", ".mov", ".mkv", ".avi", ".webm"],
                )
            with gr.Column(scale=2):
                cap_video_player = gr.Video(
                    label="Preview - scrub to find captions",
                    interactive=False,
                    height=300,
                )
        # Mirror the uploaded file into the player so it can be scrubbed.
        cap_video_upload.change(
            fn=lambda f: f,
            inputs=[cap_video_upload],
            outputs=[cap_video_player],
        )
        cap_timestamp = gr.Slider(
            label="Timestamp to sample caption detection (seconds)",
            minimum=0,
            maximum=300,
            value=3,
            step=0.5,
        )
        gr.HTML("\n")

        with gr.Tabs():
            with gr.Tab("🤖 AUTO DETECT"):
                with gr.Row(equal_height=True):
                    with gr.Column(scale=1):
                        auto_method = gr.Dropdown(
                            label="Inpaint Method",
                            choices=["hybrid", "telea", "ns", "blur", "black", "bg"],
                            value="hybrid",
                        )
                        auto_preview_btn = gr.Button("PREVIEW DETECTION", variant="secondary", size="lg")
                        auto_remove_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        auto_log = gr.Textbox(label="System Log", interactive=False, lines=4)
                        auto_out = gr.File(label="Download Result")
                    with gr.Column(scale=1):
                        auto_preview_img = gr.Image(
                            label="Detection Preview - red = pixels to be removed",
                            type="pil",
                            interactive=False,
                            height=360,
                        )
                auto_preview_btn.click(
                    fn=auto_preview,
                    inputs=[cap_video_upload, cap_timestamp],
                    outputs=[auto_preview_img, auto_log],
                )
                auto_remove_btn.click(
                    fn=auto_remove,
                    inputs=[cap_video_upload, cap_timestamp, auto_method],
                    outputs=[auto_out, auto_log],
                )

            with gr.Tab("✏️ MANUAL PAINT"):
                with gr.Row(equal_height=False):
                    with gr.Column(scale=1):
                        manual_method = gr.Dropdown(
                            label="Inpaint Method",
                            choices=["hybrid", "telea", "ns", "blur", "black", "bg"],
                            value="hybrid",
                        )
                        manual_extract_btn = gr.Button("EXTRACT FRAME", variant="secondary", size="lg")
                        manual_remove_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        manual_log = gr.Textbox(label="System Log", interactive=False, lines=4)
                        manual_out = gr.File(label="Download Result")
                    with gr.Column(scale=2):
                        manual_editor = gr.ImageEditor(
                            label="Paint over the caption text",
                            type="pil",
                            height=420,
                            brush=gr.Brush(
                                colors=["#ff0000", "#ffffff"],
                                color_mode="fixed",
                                default_size=10,
                            ),
                            eraser=gr.Eraser(default_size=24),
                            layers=False,
                            interactive=True,
                        )
                manual_extract_btn.click(
                    fn=manual_extract_frame,
                    inputs=[cap_video_upload, cap_timestamp],
                    outputs=[manual_editor, manual_log],
                )
                manual_remove_btn.click(
                    fn=manual_remove,
                    inputs=[cap_video_upload, manual_editor, manual_method],
                    outputs=[manual_out, manual_log],
                )