Spaces:
Running
Running
| import os | |
| import time | |
| import subprocess | |
| import numpy as np | |
| import cv2 | |
| import requests | |
| import gradio as gr | |
| from PIL import Image | |
| try: | |
| import onnxruntime as ort | |
| _ort_available = True | |
| except Exception: | |
| _ort_available = False | |
# Remote location of the ONNX text-detector weights; _load_yolo() downloads
# from here when the local file is missing.
YOLO_MODEL_URL = "https://github.com/hjunior29/video-text-remover/raw/main/models/text_detector/best.onnx"
# Local path the weights are read from (and downloaded to).
YOLO_MODEL_PATH = "./models/text_detector/best.onnx"
# Cached inference session; populated by the first successful _load_yolo() call.
_yolo_session = None
def _load_yolo():
    """Return the shared ONNX Runtime session for the text detector.

    The session is created on first use and cached in the module-level
    ``_yolo_session``. When the weights file is missing it is downloaded from
    ``YOLO_MODEL_URL`` first.

    Returns:
        The cached ``ort.InferenceSession``, or ``None`` when onnxruntime
        could not be imported.

    Raises:
        requests.RequestException: If the weights download fails.
        OSError: If the weights cannot be written to disk.
    """
    global _yolo_session
    if _yolo_session is not None:
        return _yolo_session
    if not _ort_available:
        return None

    os.makedirs(os.path.dirname(YOLO_MODEL_PATH), exist_ok=True)
    if not os.path.exists(YOLO_MODEL_PATH):
        print("Downloading YOLO11 text detector model...")
        # Download to a side file and rename only on success, so an
        # interrupted download never leaves a truncated model at the final
        # path (which later runs would treat as already downloaded).
        partial_path = YOLO_MODEL_PATH + ".part"
        try:
            with requests.get(YOLO_MODEL_URL, timeout=120, stream=True) as r:
                r.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, YOLO_MODEL_PATH)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Request only providers this onnxruntime build actually offers; keep the
    # original preference order (GPU first, CPU second).
    preferred = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    available = set(ort.get_available_providers())
    providers = [name for name in preferred if name in available] or preferred
    _yolo_session = ort.InferenceSession(YOLO_MODEL_PATH, providers=providers)
    return _yolo_session
def _yolo_preprocess(frame_bgr, input_size=640):
    """Convert a BGR frame into the detector's square input tensor.

    The frame is resized so its longer side equals ``input_size`` (aspect
    ratio preserved), placed in the top-left corner of a black
    ``input_size`` x ``input_size`` canvas, converted to RGB, scaled to
    [0, 1] and reordered to shape ``(1, 3, input_size, input_size)``.

    Args:
        frame_bgr: Image array whose first two dimensions are height, width.
        input_size: Side length of the square model input.

    Returns:
        ``(tensor, scale, nh, nw)`` where ``scale`` is the resize factor and
        ``nh``/``nw`` are the resized height/width inside the canvas.
    """
    h, w = frame_bgr.shape[:2]
    scale = input_size / max(h, w)
    # int() truncation can give 0 for very elongated frames, which makes
    # cv2.resize raise; keep at least one pixel and never exceed the canvas.
    nh = min(input_size, max(1, int(h * scale)))
    nw = min(input_size, max(1, int(w * scale)))
    resized = cv2.resize(frame_bgr, (nw, nh))
    padded = np.zeros((input_size, input_size, 3), dtype=np.uint8)
    padded[:nh, :nw] = resized
    rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    tensor = np.transpose(rgb, (2, 0, 1))[np.newaxis]
    return tensor, scale, nh, nw
def _yolo_postprocess(outputs, scale, orig_h, orig_w, nw, nh, conf_thresh=0.25, margin=5):
    """Turn raw detector output into a binary text mask at frame resolution.

    ``outputs[0]`` is read as one batch of candidate rows laid out as
    ``cx, cy, w, h, score...`` in model-input pixels. A 3-D array whose
    second dimension is smaller than its third is transposed first, so both
    ``(1, C, N)`` and ``(1, N, C)`` layouts are accepted.

    Args:
        outputs: Sequence whose first element is the prediction array.
        scale: Resize factor used by preprocessing; boxes are divided by it
            to map back to original-frame pixels.
        orig_h: Height of the returned mask.
        orig_w: Width of the returned mask.
        nw: Unused; kept for interface compatibility.
        nh: Unused; kept for interface compatibility.
        conf_thresh: Minimum best-class score for a row to be kept.
        margin: Pixels added on every side of each kept box.

    Returns:
        ``uint8`` array of shape ``(orig_h, orig_w)``: 255 inside kept boxes
        (both corners inclusive), 0 elsewhere.
    """
    preds = np.asarray(outputs[0])
    if preds.ndim == 3 and preds.shape[1] < preds.shape[2]:
        preds = preds.transpose(0, 2, 1)
    preds = preds[0]

    mask = np.zeros((orig_h, orig_w), dtype=np.uint8)
    if preds.ndim != 2 or preds.shape[0] == 0 or preds.shape[1] < 5:
        return mask

    # Filter by confidence in one vectorised pass instead of looping over
    # every candidate row in Python. NaN scores compare False and drop out.
    scores = preds[:, 4:].max(axis=1)
    boxes = preds[scores >= conf_thresh, :4]
    # int() below cannot convert NaN/inf coordinates.
    boxes = boxes[np.isfinite(boxes).all(axis=1)]

    for cx, cy, bw, bh in boxes:
        x1 = max(0, int((cx - bw / 2) / scale) - margin)
        y1 = max(0, int((cy - bh / 2) / scale) - margin)
        x2 = min(orig_w - 1, int((cx + bw / 2) / scale) + margin)
        y2 = min(orig_h - 1, int((cy + bh / 2) / scale) + margin)
        # A box that lies completely outside the frame ends up with inverted
        # corners after clamping; skip it rather than painting an edge sliver.
        if x2 < x1 or y2 < y1:
            continue
        mask[y1:y2 + 1, x1:x2 + 1] = 255
    return mask
def detect_text_mask_yolo(frame_bgr):
    """Run the ONNX text detector on one BGR frame.

    Returns:
        The mask produced by ``_yolo_postprocess`` at the frame's own
        resolution, or ``None`` when no session is available or any step
        raises (the error is printed, not propagated).
    """
    try:
        session = _load_yolo()
        if session is None:
            return None
        blob, ratio, new_h, new_w = _yolo_preprocess(frame_bgr)
        feed = {session.get_inputs()[0].name: blob}
        raw_outputs = session.run(None, feed)
        frame_h, frame_w = frame_bgr.shape[:2]
        return _yolo_postprocess(raw_outputs, ratio, frame_h, frame_w, new_w, new_h)
    except Exception as e:
        print(f"YOLO detect error: {e}")
        return None
def detect_caption_bbox_fallback(frame_bgr):
    """Find the largest caption-like band using thresholding and morphology.

    Two horizontal strips are searched: the bottom 35% and the top 12% of
    the frame. In each, an Otsu threshold is closed with a wide, flat kernel
    and the external contours are measured. A contour qualifies when it is
    at least 20% of the frame width and between 8 px and 25% of the frame
    height tall; the qualifying contour with the greatest bounding-box area
    wins.

    Returns:
        ``(x1, y1, x2, y2)`` in frame coordinates, or ``None`` when nothing
        qualifies. The left/top edges are padded by 6 px and the right/bottom
        edges by 12 px, each clamped to the frame.
    """
    frame_h, frame_w = frame_bgr.shape[:2]
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    # Same kernel for both strips, so build it once.
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (frame_w // 8, 3))
    pad = 6
    strips = ((int(frame_h * 0.65), frame_h), (0, int(frame_h * 0.12)))

    winner = None
    winner_area = 0
    for top, bottom in strips:
        if top >= bottom:
            continue
        _, binary = cv2.threshold(
            gray[top:bottom, :], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        merged = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, close_kernel)
        contours, _ = cv2.findContours(merged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for contour in contours:
            bx, by, bw, bh = cv2.boundingRect(contour)
            plausible = bw >= frame_w * 0.2 and bh >= 8 and bh <= frame_h * 0.25
            if not plausible:
                continue
            if bw * bh <= winner_area:
                continue
            winner_area = bw * bh
            winner = (
                max(0, bx - pad),
                max(0, top + by - pad),
                min(frame_w, bx + bw + pad * 2),
                min(frame_h, top + by + bh + pad * 2),
            )
    return winner
def get_video_dims(video_path):
    """Return ``(width, height)`` of the video as ints, as reported by OpenCV."""
    capture = cv2.VideoCapture(video_path)
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    capture.release()
    dims = (width, height)
    return dims
def extract_frame_at(video_path, timestamp_sec):
    """Grab one frame from a video as a PIL image.

    The requested time is clamped to ``[0, duration - 0.1]`` when the
    container reports a positive frame count. If seeking there yields no
    frame, the first frame of the video is used instead.

    Args:
        video_path: Path of the video file; falsy means nothing uploaded.
        timestamp_sec: Requested position in seconds.

    Returns:
        ``(image, message)``. ``image`` is an RGB ``PIL.Image`` or ``None``
        on failure; ``message`` is a status string for the UI. Exceptions are
        reported through ``message`` rather than raised.
    """
    if not video_path:
        return None, "Upload a video first."
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        ts = max(0.0, float(timestamp_sec))
        # Only clamp to the end when a frame count is actually known; a
        # count of 0 would otherwise pin every request to t=0.
        if total_frames > 0:
            ts = max(0.0, min(ts, total_frames / fps - 0.1))
        cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
        ret, frame = cap.read()
        if not ret:
            # Seek failed: reopen and fall back to the very first frame, and
            # report that position so the status message stays truthful.
            cap.release()
            cap = cv2.VideoCapture(video_path)
            ret, frame = cap.read()
            ts = 0.0
        if not ret:
            return None, "Could not read frame."
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        return image, f"Frame at {ts:.1f}s loaded. Paint over captions then click REMOVE CAPTIONS."
    except Exception as e:
        return None, f"Error: {e}"
    finally:
        # Release the capture on every path, including exceptions.
        if cap is not None:
            cap.release()
def _make_frame_inpainter(mask, method):
    """Return a ``frame -> frame`` callable for ``method`` with all mask-only work done up front."""
    region = mask == 255

    if method == 'hybrid':
        grow = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
        expanded = cv2.dilate(mask, grow, iterations=1)
        return lambda frame: cv2.inpaint(frame, expanded, inpaintRadius=3, flags=cv2.INPAINT_TELEA)

    if method == 'ns':
        return lambda frame: cv2.inpaint(frame, mask, inpaintRadius=3, flags=cv2.INPAINT_NS)

    if method == 'blur':
        def blur_region(frame):
            result = frame.copy()
            result[region] = cv2.GaussianBlur(frame, (51, 51), 30)[region]
            return result
        return blur_region

    if method == 'black':
        def black_region(frame):
            result = frame.copy()
            result[region] = 0
            return result
        return black_region

    if method == 'bg':
        ring_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))
        # Ring of pixels just outside the mask, used to sample a fill colour.
        border = cv2.dilate(mask, ring_kernel) - mask

        def fill_region(frame):
            result = frame.copy()
            mean_color = cv2.mean(frame, mask=border)[:3]
            result[region] = [int(mean_color[0]), int(mean_color[1]), int(mean_color[2])]
            return result
        return fill_region

    # Any other name (including 'telea') uses plain Telea inpainting.
    return lambda frame: cv2.inpaint(frame, mask, inpaintRadius=2, flags=cv2.INPAINT_TELEA)


def run_opencv_inpaint(abs_input, output_path, mask_img, method='hybrid'):
    """Apply a static mask to every frame of a video and re-encode it.

    Frames are processed with OpenCV into a temporary ``mp4v`` file, which
    ffmpeg then re-encodes to H.264. Audio, when the input has any, is taken
    from the original file and encoded as AAC.

    Args:
        abs_input: Path of the source video.
        output_path: Path of the final MP4; ``output_path + "_tmp.mp4"`` is
            used as scratch space and removed afterwards.
        mask_img: Single-channel mask; pixels equal to 255 are treated as
            caption pixels. It is resized to the video's dimensions.
        method: One of ``'hybrid'``, ``'ns'``, ``'blur'``, ``'black'``,
            ``'bg'``; any other value selects plain Telea inpainting.

    Returns:
        ``(True, timing_summary)`` on success, otherwise
        ``(False, error_text)``. Nothing is raised to the caller.
    """
    temp_vid = output_path + "_tmp.mp4"
    cap = None
    out = None
    try:
        vid_w, vid_h = get_video_dims(abs_input)
        if vid_w <= 0 or vid_h <= 0:
            return False, f"Could not read video dimensions: {abs_input}"

        mask = cv2.resize(mask_img, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)
        # The mask never changes between frames, so kernels, dilation and the
        # boolean region are computed once here instead of once per frame.
        process = _make_frame_inpainter(mask, method)

        cap = cv2.VideoCapture(abs_input)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(temp_vid, fourcc, fps, (vid_w, vid_h))

        frame_idx = 0
        t_start = time.time()
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(process(frame))
            frame_idx += 1
        elapsed = time.time() - t_start

        # The writer must be closed before ffmpeg reads the temp file.
        cap.release()
        cap = None
        out.release()
        out = None

        if frame_idx == 0:
            return False, "No frames could be read from the input video."

        # Take audio straight from the source. The trailing "?" makes the
        # audio map optional, so silent inputs still encode, and decoding
        # from the source works for any input codec (stream-copying into a
        # raw .aac file only works when the source audio is already AAC).
        subprocess.run(["ffmpeg", "-y", "-i", temp_vid, "-i", abs_input,
                        "-map", "0:v:0", "-map", "1:a:0?",
                        "-c:v", "libx264", "-preset", "fast", "-crf", "18",
                        "-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart",
                        output_path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        fps_rate = frame_idx / max(elapsed, 0.1)
        return True, f"{frame_idx} frames in {elapsed:.1f}s ({fps_rate:.1f} fps)"
    except subprocess.CalledProcessError as e:
        # Surface ffmpeg's own diagnostics instead of only the exit status.
        detail = e.stderr.decode("utf-8", errors="replace") if e.stderr else str(e)
        return False, f"ffmpeg failed:\n{detail}"
    except Exception:
        import traceback
        return False, traceback.format_exc()
    finally:
        # Release handles and scratch files on every path, including errors.
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
        if os.path.exists(temp_vid):
            os.remove(temp_vid)
def auto_preview(video_path, timestamp_sec=3.0):
    """Show which pixels automatic detection would remove at a timestamp.

    The frame at ``timestamp_sec`` is run through the YOLO detector; if that
    yields no mask, the OpenCV band heuristic is tried. Detected pixels are
    tinted red at 50% opacity.

    Returns:
        ``(image, message)``. ``image`` is the tinted preview, the untouched
        frame when nothing was detected, or ``None`` when no frame could be
        read or an error occurred.
    """
    if not video_path:
        return None, "Upload a video first."
    try:
        seek_ms = max(0.0, float(timestamp_sec)) * 1000
        reader = cv2.VideoCapture(video_path)
        reader.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
        grabbed, frame = reader.read()
        reader.release()
        if not grabbed:
            return None, "Could not read frame."

        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        detector = "YOLO11"
        mask = detect_text_mask_yolo(frame)
        if mask is None or mask.max() == 0:
            detector = "OpenCV fallback"
            box = detect_caption_bbox_fallback(frame)
            if box is None:
                return Image.fromarray(rgb), "No captions detected. Try Manual mode or adjust timestamp."
            left, top, right, bottom = box
            mask = np.zeros(frame.shape[:2], dtype=np.uint8)
            cv2.rectangle(mask, (left, top), (right, bottom), 255, -1)

        tinted = rgb.copy()
        tinted[mask == 255] = [220, 40, 40]
        blended = cv2.addWeighted(rgb, 0.5, tinted, 0.5, 0)
        n_pixels = int(mask.sum() / 255)
        return Image.fromarray(blended), f"{n_pixels} text pixels found via {detector}. Red = will be removed."
    except Exception as e:
        return None, f"Error: {e}"
def auto_remove(video_path, timestamp_sec=3.0, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Detect captions on one sampled frame and remove them from the whole video.

    A mask is built from the frame at ``timestamp_sec`` (YOLO first, OpenCV
    band heuristic second) and applied to every frame by
    ``run_opencv_inpaint``. The result is written to
    ``<save_folder>/<input stem>_NO_CAPTIONS.mp4``.

    Returns:
        ``(output_path, report)`` on success, or ``(None, message)`` when
        nothing was uploaded, no frame or caption was found, or processing
        failed.
    """
    if not video_path:
        return None, "No video uploaded."

    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    stem, _ext = os.path.splitext(os.path.basename(video_path))
    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))

    try:
        seek_ms = max(0.0, float(timestamp_sec)) * 1000
        reader = cv2.VideoCapture(abs_input)
        reader.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
        grabbed, frame = reader.read()
        reader.release()
        if not grabbed:
            return None, "Could not read frame."

        vid_w, vid_h = get_video_dims(abs_input)
        detector = "YOLO11"
        mask = detect_text_mask_yolo(frame)
        if mask is not None and mask.max() != 0:
            mask = cv2.resize(mask, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)
        else:
            detector = "OpenCV fallback"
            box = detect_caption_bbox_fallback(frame)
            if box is None:
                return None, "No captions detected. Try Manual mode."
            left, top, right, bottom = box
            mask = np.zeros((vid_h, vid_w), dtype=np.uint8)
            cv2.rectangle(mask, (left, top), (right, bottom), 255, -1)

        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"

        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ {detector} + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        import traceback
        return None, f"Error:\n{traceback.format_exc()}"
def manual_extract_frame(video_path, timestamp_sec):
    """Fetch the frame to paint on in Manual mode; delegates to ``extract_frame_at``."""
    frame_image, status = extract_frame_at(video_path, timestamp_sec)
    return frame_image, status
def manual_remove(video_path, editor_value, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Remove the hand-painted region from every frame of the video.

    The mask is taken from the editor's paint layers (any pixel with alpha
    above 10). If that leaves the mask empty and both ``composite`` and
    ``background`` are present, pixels where they differ by more than 15
    grey levels are used instead. The result is written to
    ``<save_folder>/<input stem>_NO_CAPTIONS.mp4``.

    Args:
        video_path: Path of the uploaded video.
        editor_value: Mapping read via ``.get`` for the keys ``"layers"``,
            ``"composite"`` and ``"background"``; ``None`` means no frame was
            extracted yet.
        inpaint_method: Method name forwarded to ``run_opencv_inpaint``.
        save_folder: Output directory, created if missing.

    Returns:
        ``(output_path, report)`` on success, otherwise ``(None, message)``.
    """
    if not video_path:
        return None, "No video uploaded."
    if editor_value is None:
        return None, "Extract frame and paint over captions first."

    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    stem, _ext = os.path.splitext(os.path.basename(video_path))
    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))

    try:
        vid_w, vid_h = get_video_dims(abs_input)
        target_size = (vid_w, vid_h)
        layers = editor_value.get("layers", [])
        composite = editor_value.get("composite")
        background = editor_value.get("background")

        mask = np.zeros((vid_h, vid_w), dtype=np.uint8)

        # Primary source: alpha channel of each paint layer.
        for layer in (layers or ()):
            if layer is None:
                continue
            rgba = cv2.resize(
                np.array(layer.convert("RGBA")), target_size, interpolation=cv2.INTER_NEAREST
            )
            mask[rgba[:, :, 3] > 10] = 255

        # Secondary source: difference between painted composite and background.
        have_both = composite is not None and background is not None
        if mask.max() == 0 and have_both:
            painted = cv2.resize(np.array(composite.convert("RGB")), target_size)
            clean = cv2.resize(np.array(background.convert("RGB")), target_size)
            diff = cv2.cvtColor(cv2.absdiff(painted, clean), cv2.COLOR_RGB2GRAY)
            mask[diff > 15] = 255

        if mask.max() == 0:
            return None, "No strokes detected. Paint over the captions."

        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"

        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ Manual + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        import traceback
        return None, f"Error:\n{traceback.format_exc()}"
def build_tab():
    """Build the "CAPTION REMOVER" Gradio tab and wire its event handlers.

    Creates the upload/preview row, a shared timestamp slider, and two
    sub-tabs (automatic detection and manual painting) whose buttons call
    ``auto_preview``, ``auto_remove``, ``manual_extract_frame`` and
    ``manual_remove``. Nothing is returned; components are created inside
    the Gradio context managers.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been stripped -- confirm the layout against the running
    app.
    """
    with gr.Tab("🖌️ CAPTION REMOVER"):
        # Header banner with a short usage hint.
        gr.HTML("""<div style="padding:16px 4px 6px;">
<div style="font-family:'Orbitron',sans-serif;font-size:.65rem;font-weight:700;color:#00d4ff;letter-spacing:.2em;text-transform:uppercase;margin-bottom:6px;">YOLO11 + OpenCV Inpainting</div>
<div style="font-family:'Share Tech Mono',monospace;font-size:.75rem;color:#2a5570;line-height:1.9;">
Upload video, scrub to where captions appear, use Auto or Manual.<br>
<span style="color:#ff3c6e;">Complex backgrounds may still show minor artifacts on CPU.</span>
</div></div>""")
        with gr.Row():
            with gr.Column(scale=1):
                cap_video_upload = gr.File(label="Upload Video", file_types=[".mp4",".mov",".mkv",".avi",".webm"])
            with gr.Column(scale=2):
                cap_video_player = gr.Video(label="Preview - scrub to find captions", interactive=False, height=300)
        # Mirror the uploaded file into the read-only player.
        cap_video_upload.change(fn=lambda f: f, inputs=[cap_video_upload], outputs=[cap_video_player])
        # One timestamp slider is shared by both sub-tabs below.
        cap_timestamp = gr.Slider(label="Timestamp to sample caption detection (seconds)", minimum=0, maximum=300, value=3, step=0.5)
        # Thin horizontal divider.
        gr.HTML("<div style='height:1px;background:#0d2137;margin:8px 0;'></div>")
        with gr.Tabs():
            with gr.Tab("🤖 AUTO DETECT"):
                with gr.Row(equal_height=True):
                    with gr.Column(scale=1):
                        auto_method = gr.Dropdown(label="Inpaint Method", choices=["hybrid","telea","ns","blur","black","bg"], value="hybrid")
                        auto_preview_btn = gr.Button("PREVIEW DETECTION", variant="secondary", size="lg")
                        auto_remove_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        auto_log = gr.Textbox(label="System Log", interactive=False, lines=4)
                        auto_out = gr.File(label="Download Result")
                    with gr.Column(scale=1):
                        auto_preview_img = gr.Image(label="Detection Preview - red = pixels to be removed", type="pil", interactive=False, height=360)
                # Preview fills the image + log; removal fills the download + log.
                auto_preview_btn.click(fn=auto_preview, inputs=[cap_video_upload, cap_timestamp], outputs=[auto_preview_img, auto_log])
                auto_remove_btn.click(fn=auto_remove, inputs=[cap_video_upload, cap_timestamp, auto_method], outputs=[auto_out, auto_log])
            with gr.Tab("✏️ MANUAL PAINT"):
                with gr.Row(equal_height=False):
                    with gr.Column(scale=1):
                        manual_method = gr.Dropdown(label="Inpaint Method", choices=["hybrid","telea","ns","blur","black","bg"], value="hybrid")
                        manual_extract_btn = gr.Button("EXTRACT FRAME", variant="secondary", size="lg")
                        manual_remove_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        manual_log = gr.Textbox(label="System Log", interactive=False, lines=4)
                        manual_out = gr.File(label="Download Result")
                    with gr.Column(scale=2):
                        # Editor the user paints on; its value is passed to manual_remove.
                        manual_editor = gr.ImageEditor(label="Paint over the caption text", type="pil", height=420,
                            brush=gr.Brush(colors=["#ff0000","#ffffff"], color_mode="fixed", default_size=10),
                            eraser=gr.Eraser(default_size=24), layers=False, interactive=True)
                # Extract loads a frame into the editor; remove consumes the painted editor value.
                manual_extract_btn.click(fn=manual_extract_frame, inputs=[cap_video_upload, cap_timestamp], outputs=[manual_editor, manual_log])
                manual_remove_btn.click(fn=manual_remove, inputs=[cap_video_upload, manual_editor, manual_method], outputs=[manual_out, manual_log])