Spaces:

rx66y
/

Yta-Tools-V2

Running

App Files Files Community

Yta-Tools-V2 / caption.py

rx66y

Upload 10 files

031627f verified 21 days ago

raw

history blame contribute delete

17.1 kB

	import os
	import time
	import subprocess
	import numpy as np
	import cv2
	import requests
	import gradio as gr
	from PIL import Image

	# onnxruntime is optional: when it cannot be imported the YOLO detector is
	# disabled and callers are expected to check ``_ort_available`` first.
	try:
	    import onnxruntime as ort
	    _ort_available = True
	except Exception:
	    # Any failure while importing is treated as "not available". ``ort`` is
	    # still bound so that the name always exists at module level.
	    ort = None
	    _ort_available = False

	# Remote location of the ONNX text-detector weights and where they are stored locally.
	YOLO_MODEL_URL = "https://github.com/hjunior29/video-text-remover/raw/main/models/text_detector/best.onnx"
	YOLO_MODEL_PATH = "./models/text_detector/best.onnx"
	# Lazily created inference session, shared by every call in this process.
	_yolo_session = None

	def _load_yolo():
	    """Return the shared ONNX Runtime session for the text detector.

	    The session is created on first use and cached in ``_yolo_session``.
	    When the model file is not present at ``YOLO_MODEL_PATH`` it is
	    downloaded from ``YOLO_MODEL_URL`` first.

	    Returns:
	        The cached inference session, or ``None`` when onnxruntime could
	        not be imported (``_ort_available`` is false).

	    Raises:
	        requests.RequestException: if the model download fails.
	        OSError: if the model file cannot be written.
	    """
	    global _yolo_session
	    if _yolo_session is not None:
	        return _yolo_session
	    if not _ort_available:
	        return None
	    os.makedirs(os.path.dirname(YOLO_MODEL_PATH), exist_ok=True)
	    if not os.path.exists(YOLO_MODEL_PATH):
	        print("Downloading YOLO11 text detector model...")
	        # Download next to the final path and rename only once complete, so
	        # an interrupted download can never be mistaken for a valid model.
	        partial_path = YOLO_MODEL_PATH + ".part"
	        try:
	            with requests.get(YOLO_MODEL_URL, timeout=120, stream=True) as r:
	                r.raise_for_status()
	                with open(partial_path, "wb") as f:
	                    for chunk in r.iter_content(chunk_size=8192):
	                        f.write(chunk)
	            os.replace(partial_path, YOLO_MODEL_PATH)
	        finally:
	            # Only still present when the download or rename failed.
	            if os.path.exists(partial_path):
	                os.remove(partial_path)
	    # Ask only for providers this onnxruntime build actually offers,
	    # keeping the original preference order (CUDA first, then CPU).
	    preferred = ["CUDAExecutionProvider", "CPUExecutionProvider"]
	    available = ort.get_available_providers()
	    providers = [p for p in preferred if p in available] or ["CPUExecutionProvider"]
	    _yolo_session = ort.InferenceSession(YOLO_MODEL_PATH, providers=providers)
	    return _yolo_session

	def _yolo_preprocess(frame_bgr, input_size=640):
	    """Resize and pad a BGR frame into a square network input tensor.

	    The frame is scaled so its longer side equals ``input_size``, placed in
	    the top-left corner of a black ``input_size`` x ``input_size`` canvas,
	    converted to RGB, scaled to the 0..1 range and reordered to
	    ``(1, 3, input_size, input_size)`` float32.

	    Args:
	        frame_bgr: image array whose first two dimensions are height and width.
	        input_size: side length of the square network input.

	    Returns:
	        ``(tensor, scale, nh, nw)`` where ``scale`` is the resize factor and
	        ``nh``/``nw`` are the height and width of the resized image inside
	        the padded canvas.
	    """
	    h, w = frame_bgr.shape[:2]
	    scale = input_size / max(h, w)
	    # Truncation can yield 0 for extremely elongated frames, which
	    # cv2.resize rejects; keep at least one pixel per side.
	    nh = max(1, int(h * scale))
	    nw = max(1, int(w * scale))
	    resized = cv2.resize(frame_bgr, (nw, nh))
	    padded = np.zeros((input_size, input_size, 3), dtype=np.uint8)
	    padded[:nh, :nw] = resized
	    rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
	    # HWC -> CHW, then add the leading batch dimension.
	    tensor = np.transpose(rgb, (2, 0, 1))[np.newaxis]
	    return tensor, scale, nh, nw

	def _yolo_postprocess(outputs, scale, orig_h, orig_w, nw, nh, conf_thresh=0.25, margin=5):
	    """Turn raw detector output into a filled binary mask of detected boxes.

	    Only ``outputs[0]`` is used. A 3-D array whose second dimension is
	    smaller than its third is transposed so that each row is one detection,
	    and the first batch entry is taken. Each row is read as
	    ``(cx, cy, w, h)`` followed by score columns; the row's confidence is
	    the maximum of the columns from index 4 onward.

	    Args:
	        outputs: sequence of arrays returned by the inference session.
	        scale: factor the box coordinates are divided by to map them back
	            to original-frame pixels.
	        orig_h, orig_w: size of the mask to produce.
	        nw, nh: accepted for call compatibility; not used here.
	        conf_thresh: rows with a lower confidence are ignored.
	        margin: pixels added on every side of each box.

	    Returns:
	        A ``uint8`` array of shape ``(orig_h, orig_w)`` with 255 inside
	        every accepted (margin-expanded) box and 0 elsewhere.
	    """
	    raw = outputs[0]
	    needs_transpose = raw.ndim == 3 and raw.shape[1] < raw.shape[2]
	    if needs_transpose:
	        raw = raw.transpose(0, 2, 1)
	    detections = raw[0]

	    text_mask = np.zeros((orig_h, orig_w), dtype=np.uint8)
	    for row in detections:
	        # A row needs four box values plus at least one score.
	        if row.shape[0] < 5:
	            continue
	        score = float(row[4:].max())
	        if score < conf_thresh:
	            continue
	        cx, cy, bw, bh = row[:4]
	        # Centre/size -> corners, rescaled, padded and clamped to the frame.
	        left = max(0, int((cx - bw/2) / scale) - margin)
	        top = max(0, int((cy - bh/2) / scale) - margin)
	        right = min(orig_w, int((cx + bw/2) / scale) + margin)
	        bottom = min(orig_h, int((cy + bh/2) / scale) + margin)
	        cv2.rectangle(text_mask, (left, top), (right, bottom), 255, -1)
	    return text_mask

	def detect_text_mask_yolo(frame_bgr):
	    """Run the YOLO text detector on one frame and return its text mask.

	    Returns:
	        The mask produced by ``_yolo_postprocess`` for this frame, or
	        ``None`` when no session is available or any step raises (the
	        error is printed, not propagated).
	    """
	    try:
	        session = _load_yolo()
	        if session is None:
	            return None
	        tensor, scale, nh, nw = _yolo_preprocess(frame_bgr)
	        feed = {session.get_inputs()[0].name: tensor}
	        outputs = session.run(None, feed)
	        frame_h, frame_w = frame_bgr.shape[:2]
	        return _yolo_postprocess(outputs, scale, frame_h, frame_w, nw, nh)
	    except Exception as e:
	        print(f"YOLO detect error: {e}")
	        return None

	def detect_caption_bbox_fallback(frame_bgr):
	    """Find one caption-like box using plain OpenCV thresholding.

	    Two horizontal bands are searched: the bottom 35% of the frame and the
	    top 12%. Each band is Otsu-thresholded, closed with a wide flat kernel
	    so neighbouring glyphs merge, and its external contours are measured.
	    A contour qualifies when it is at least 20% of the frame width, at
	    least 8 px tall and at most 25% of the frame height; the qualifying
	    contour with the largest bounding-box area wins.

	    Args:
	        frame_bgr: BGR image array.

	    Returns:
	        ``(x1, y1, x2, y2)`` in frame coordinates, clamped to the frame, or
	        ``None`` when nothing qualifies.
	    """
	    h, w = frame_bgr.shape[:2]
	    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
	    best_box, best_area = None, 0
	    pad = 6
	    # Same kernel for both bands; width is clamped because w // 8 is 0 for
	    # frames narrower than 8 px and OpenCV rejects zero-sized kernels.
	    kern = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 8), 3))
	    bands = [(int(h * 0.65), h), (0, int(h * 0.12))]
	    for y_start, y_end in bands:
	        if y_end <= y_start:
	            continue
	        region = gray[y_start:y_end, :]
	        _, thresh = cv2.threshold(region, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
	        closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kern)
	        contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
	        for cnt in contours:
	            cx, cy, cw, ch2 = cv2.boundingRect(cnt)
	            if cw < w * 0.2 or ch2 < 8 or ch2 > h * 0.25:
	                continue
	            area = cw * ch2
	            if area > best_area:
	                best_area = area
	                # Contour coordinates are band-relative, so y is shifted by
	                # y_start. The right/bottom edges get twice the padding of
	                # the left/top edges, as in the original expression.
	                best_box = (max(0, cx - pad), max(0, y_start + cy - pad),
	                            min(w, cx + cw + pad * 2), min(h, y_start + cy + ch2 + pad * 2))
	    return best_box

	def get_video_dims(video_path):
	    """Return ``(width, height)`` of the video as reported by OpenCV.

	    The values are the capture's frame-width and frame-height properties
	    converted to ``int``.
	    """
	    reader = cv2.VideoCapture(video_path)
	    width = int(reader.get(cv2.CAP_PROP_FRAME_WIDTH))
	    height = int(reader.get(cv2.CAP_PROP_FRAME_HEIGHT))
	    reader.release()
	    return width, height

	def extract_frame_at(video_path, timestamp_sec):
	    """Grab the frame at ``timestamp_sec`` and return it as a PIL image.

	    The timestamp is clamped to be non-negative and, when the container
	    reports a frame count, to end 0.1 s before the end of the video. If
	    the seek/read fails, the first frame of the video is used instead.

	    Args:
	        video_path: path of the video; a falsy value short-circuits.
	        timestamp_sec: requested position in seconds.

	    Returns:
	        ``(image, message)`` on success, or ``(None, message)`` when no
	        video was given, no frame could be read, or an error occurred.
	    """
	    if not video_path:
	        return None, "Upload a video first."
	    cap = None
	    try:
	        cap = cv2.VideoCapture(video_path)
	        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
	        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	        ts = max(0.0, float(timestamp_sec))
	        # Only clamp against the duration when a frame count is reported;
	        # with a count of 0 the old clamp forced every request to 0 s.
	        if total_frames > 0:
	            ts = max(0.0, min(ts, total_frames / fps - 0.1))
	        cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
	        ret, frame = cap.read()
	        if not ret:
	            # Seek failed: reopen and fall back to the first frame.
	            cap.release()
	            cap = cv2.VideoCapture(video_path)
	            ret, frame = cap.read()
	            ts = 0.0  # keep the status message truthful
	        if not ret:
	            return None, "Could not read frame."
	        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)), \
	            f"Frame at {ts:.1f}s loaded. Paint over captions then click REMOVE CAPTIONS."
	    except Exception as e:
	        return None, f"Error: {e}"
	    finally:
	        # Release the capture on every path, including errors.
	        if cap is not None:
	            cap.release()

	def run_opencv_inpaint(abs_input, output_path, mask_img, method='hybrid'):
	    """Remove the masked region from every frame and write the result video.

	    Frames are processed with OpenCV into a temporary ``mp4v`` file, which
	    ffmpeg then re-encodes to H.264 at ``output_path``. Audio is taken
	    from the original input when it has an audio stream.

	    Args:
	        abs_input: path of the source video.
	        output_path: path of the final video; ``output_path + "_tmp.mp4"``
	            is used as scratch space and removed afterwards.
	        mask_img: single-channel mask; pixels equal to 255 mark the region
	            to remove. It is resized to the video dimensions.
	        method: ``'hybrid'`` (Telea inpaint on a dilated mask), ``'ns'``
	            (Navier-Stokes inpaint), ``'blur'``, ``'black'``, ``'bg'``
	            (fill with the mean colour around the mask); any other value
	            uses Telea inpaint on the mask as given.

	    Returns:
	        ``(True, timing_summary)`` on success, or ``(False, details)``
	        where ``details`` is an error message or a formatted traceback.
	    """
	    temp_vid = output_path + "_tmp.mp4"
	    cap = None
	    out = None
	    try:
	        vid_w, vid_h = get_video_dims(abs_input)
	        cap = cv2.VideoCapture(abs_input)
	        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
	        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
	        out = cv2.VideoWriter(temp_vid, fourcc, fps, (vid_w, vid_h))
	        mask = cv2.resize(mask_img, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)

	        # Everything that depends only on the mask is computed once here
	        # instead of once per frame.
	        selected = mask == 255
	        expanded = None
	        border = None
	        if method == 'hybrid':
	            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
	            expanded = cv2.dilate(mask, kernel, iterations=1)
	        elif method == 'bg':
	            kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))
	            # Ring of pixels just outside the mask, used to sample colour.
	            border = cv2.dilate(mask, kernel2) - mask

	        frame_idx = 0
	        t_start = time.time()
	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                break
	            if method == 'hybrid':
	                inpainted = cv2.inpaint(frame, expanded, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
	            elif method == 'ns':
	                inpainted = cv2.inpaint(frame, mask, inpaintRadius=3, flags=cv2.INPAINT_NS)
	            elif method == 'blur':
	                inpainted = frame.copy()
	                blurred = cv2.GaussianBlur(frame, (51, 51), 30)
	                inpainted[selected] = blurred[selected]
	            elif method == 'black':
	                inpainted = frame.copy()
	                inpainted[selected] = 0
	            elif method == 'bg':
	                inpainted = frame.copy()
	                mean_color = cv2.mean(frame, mask=border)[:3]
	                inpainted[selected] = [int(mean_color[0]), int(mean_color[1]), int(mean_color[2])]
	            else:
	                inpainted = cv2.inpaint(frame, mask, inpaintRadius=2, flags=cv2.INPAINT_TELEA)
	            out.write(inpainted)
	            frame_idx += 1
	        elapsed = time.time() - t_start
	        fps_rate = frame_idx / max(elapsed, 0.1)

	        # The writer must be closed before ffmpeg reads the temporary file.
	        cap.release()
	        cap = None
	        out.release()
	        out = None

	        if frame_idx == 0:
	            return False, "No frames could be read from the input video."

	        encode_args = ["-c:v", "libx264", "-preset", "fast", "-crf", "18"]
	        try:
	            # "1:a:0?" maps the first audio stream of the original input
	            # only if one exists, so silent videos and any source audio
	            # codec go through the same command.
	            subprocess.run(
	                ["ffmpeg", "-y", "-i", temp_vid, "-i", abs_input,
	                 "-map", "0:v:0", "-map", "1:a:0?"]
	                + encode_args
	                + ["-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart", output_path],
	                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        except subprocess.CalledProcessError:
	            # Audio handling stays best-effort: retry with video only.
	            subprocess.run(
	                ["ffmpeg", "-y", "-i", temp_vid]
	                + encode_args
	                + ["-movflags", "+faststart", output_path],
	                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	        return True, f"{frame_idx} frames in {elapsed:.1f}s ({fps_rate:.1f} fps)"
	    except Exception:
	        import traceback
	        return False, traceback.format_exc()
	    finally:
	        # Runs on success and failure alike so nothing is left open or on disk.
	        if cap is not None:
	            cap.release()
	        if out is not None:
	            out.release()
	        if os.path.exists(temp_vid):
	            os.remove(temp_vid)

	def auto_preview(video_path, timestamp_sec=3.0):
	    """Show which pixels automatic detection would remove at a timestamp.

	    The frame at ``timestamp_sec`` is run through the YOLO detector; when
	    that yields no mask (or an empty one) the OpenCV fallback box is used
	    instead. The detected region is tinted red at 50% opacity.

	    Returns:
	        ``(image, message)``. The image is the tinted preview, the
	        untouched frame when nothing was detected, or ``None`` when no
	        video was given, the frame could not be read, or an error occurred.
	    """
	    if not video_path:
	        return None, "Upload a video first."
	    try:
	        seek_sec = max(0.0, float(timestamp_sec))
	        reader = cv2.VideoCapture(video_path)
	        reader.set(cv2.CAP_PROP_POS_MSEC, seek_sec * 1000)
	        got_frame, frame_bgr = reader.read()
	        reader.release()
	        if not got_frame:
	            return None, "Could not read frame."

	        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
	        height, width = frame_bgr.shape[:2]

	        detector = "YOLO11"
	        text_mask = detect_text_mask_yolo(frame_bgr)
	        if text_mask is None or text_mask.max() == 0:
	            # Detector unavailable or found nothing: try the heuristic box.
	            detector = "OpenCV fallback"
	            bbox = detect_caption_bbox_fallback(frame_bgr)
	            if bbox is None:
	                return Image.fromarray(frame_rgb), "No captions detected. Try Manual mode or adjust timestamp."
	            text_mask = np.zeros((height, width), dtype=np.uint8)
	            left, top, right, bottom = bbox
	            cv2.rectangle(text_mask, (left, top), (right, bottom), 255, -1)

	        # Paint the masked pixels red on a copy, then blend 50/50 with the frame.
	        tinted = frame_rgb.copy()
	        tinted[text_mask == 255] = [220, 40, 40]
	        blended = cv2.addWeighted(frame_rgb, 0.5, tinted, 0.5, 0)
	        n_pixels = int(text_mask.sum() / 255)
	        return Image.fromarray(blended), f"{n_pixels} text pixels found via {detector}. Red = will be removed."
	    except Exception as e:
	        return None, f"Error: {e}"

	def auto_remove(video_path, timestamp_sec=3.0, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
	    """Detect captions on one sampled frame and remove that region from the whole video.

	    The mask comes from the YOLO detector, or from the OpenCV fallback box
	    when the detector yields nothing. The same mask is applied to every
	    frame by ``run_opencv_inpaint``.

	    Args:
	        video_path: source video; a falsy value short-circuits.
	        timestamp_sec: position of the frame used for detection.
	        inpaint_method: method name forwarded to ``run_opencv_inpaint``.
	        save_folder: directory (created if needed) for the output file,
	            which is named ``<input stem>_NO_CAPTIONS.mp4``.

	    Returns:
	        ``(output_path, message)`` on success, otherwise ``(None, message)``.
	    """
	    if not video_path:
	        return None, "No video uploaded."
	    os.makedirs(save_folder, exist_ok=True)
	    abs_input = os.path.abspath(video_path)
	    stem, _ext = os.path.splitext(os.path.basename(video_path))
	    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))
	    try:
	        seek_ms = max(0.0, float(timestamp_sec)) * 1000
	        reader = cv2.VideoCapture(abs_input)
	        reader.set(cv2.CAP_PROP_POS_MSEC, seek_ms)
	        got_frame, frame_bgr = reader.read()
	        reader.release()
	        if not got_frame:
	            return None, "Could not read frame."

	        vid_w, vid_h = get_video_dims(abs_input)
	        detector = "YOLO11"
	        text_mask = detect_text_mask_yolo(frame_bgr)
	        yolo_found_text = text_mask is not None and text_mask.max() != 0
	        if yolo_found_text:
	            text_mask = cv2.resize(text_mask, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)
	        else:
	            # Detector unavailable or found nothing: try the heuristic box.
	            detector = "OpenCV fallback"
	            bbox = detect_caption_bbox_fallback(frame_bgr)
	            if bbox is None:
	                return None, "No captions detected. Try Manual mode."
	            text_mask = np.zeros((vid_h, vid_w), dtype=np.uint8)
	            left, top, right, bottom = bbox
	            cv2.rectangle(text_mask, (left, top), (right, bottom), 255, -1)

	        ok, info = run_opencv_inpaint(abs_input, output_path, text_mask, method=inpaint_method)
	        if not ok:
	            return None, f"Inpaint failed:\n{info}"
	        size_mb = os.path.getsize(output_path) / (1024 * 1024)
	        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ {detector} + {inpaint_method.upper()}\nTIME ▸ {info}"
	    except Exception as e:
	        import traceback
	        return None, f"Error:\n{traceback.format_exc()}"

	def manual_extract_frame(video_path, timestamp_sec):
	    """Return the frame and status message for the manual-paint editor.

	    Delegates directly to ``extract_frame_at`` with the same arguments.
	    """
	    frame_and_message = extract_frame_at(video_path, timestamp_sec)
	    return frame_and_message

	def manual_remove(video_path, editor_value, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
	    """Remove the region the user painted in the image editor from the whole video.

	    The mask is built from the editor's ``"layers"``: any pixel whose alpha
	    exceeds 10 is marked. If that yields nothing and both ``"composite"``
	    and ``"background"`` are present, pixels where the two differ by more
	    than 15 (grayscale) are marked instead.

	    Args:
	        video_path: source video; a falsy value short-circuits.
	        editor_value: mapping read with the keys ``"layers"``,
	            ``"composite"`` and ``"background"``; ``None`` short-circuits.
	        inpaint_method: method name forwarded to ``run_opencv_inpaint``.
	        save_folder: directory (created if needed) for the output file,
	            which is named ``<input stem>_NO_CAPTIONS.mp4``.

	    Returns:
	        ``(output_path, message)`` on success, otherwise ``(None, message)``.
	    """
	    if not video_path:
	        return None, "No video uploaded."
	    if editor_value is None:
	        return None, "Extract frame and paint over captions first."
	    os.makedirs(save_folder, exist_ok=True)
	    abs_input = os.path.abspath(video_path)
	    stem, _ext = os.path.splitext(os.path.basename(video_path))
	    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))
	    try:
	        vid_w, vid_h = get_video_dims(abs_input)
	        target_size = (vid_w, vid_h)
	        layers = editor_value.get("layers", [])
	        composite = editor_value.get("composite")
	        background = editor_value.get("background")

	        stroke_mask = np.zeros((vid_h, vid_w), dtype=np.uint8)
	        for layer in layers or []:
	            if layer is None:
	                continue
	            rgba = cv2.resize(np.array(layer.convert("RGBA")), target_size,
	                              interpolation=cv2.INTER_NEAREST)
	            # Any sufficiently opaque pixel on a layer counts as a stroke.
	            stroke_mask[rgba[:, :, 3] > 10] = 255

	        layers_were_empty = stroke_mask.max() == 0
	        if layers_were_empty and composite is not None and background is not None:
	            # No usable layer data: recover strokes from composite vs. background.
	            painted = cv2.resize(np.array(composite.convert("RGB")), target_size)
	            clean = cv2.resize(np.array(background.convert("RGB")), target_size)
	            delta = cv2.cvtColor(cv2.absdiff(painted, clean), cv2.COLOR_RGB2GRAY)
	            stroke_mask[delta > 15] = 255

	        if stroke_mask.max() == 0:
	            return None, "No strokes detected. Paint over the captions."

	        ok, info = run_opencv_inpaint(abs_input, output_path, stroke_mask, method=inpaint_method)
	        if not ok:
	            return None, f"Inpaint failed:\n{info}"
	        size_mb = os.path.getsize(output_path) / (1024 * 1024)
	        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ Manual + {inpaint_method.upper()}\nTIME ▸ {info}"
	    except Exception as e:
	        import traceback
	        return None, f"Error:\n{traceback.format_exc()}"

	def build_tab() -> None:
	    """Build the "CAPTION REMOVER" Gradio tab and wire its event handlers.

	    The tab has a shared upload/preview/timestamp area followed by two
	    sub-tabs: automatic detection (``auto_preview`` / ``auto_remove``) and
	    manual painting (``manual_extract_frame`` / ``manual_remove``).
	    Nothing is returned; components are created for their side effect.

	    NOTE(review): the nesting of the ``with`` blocks below was reconstructed
	    from a source whose indentation was lost. Confirm the placement of the
	    timestamp slider and the event bindings against the deployed layout.
	    """
	    with gr.Tab("🖌️ CAPTION REMOVER"):
	        # Static header text for the tab.
	        gr.HTML("""<div style="padding:16px 4px 6px;">
	<div style="font-family:'Orbitron',sans-serif;font-size:.65rem;font-weight:700;color:#00d4ff;letter-spacing:.2em;text-transform:uppercase;margin-bottom:6px;">YOLO11 + OpenCV Inpainting</div>
	<div style="font-family:'Share Tech Mono',monospace;font-size:.75rem;color:#2a5570;line-height:1.9;">
	Upload video, scrub to where captions appear, use Auto or Manual.<br>
	<span style="color:#ff3c6e;">Complex backgrounds may still show minor artifacts on CPU.</span>
	</div></div>""")
	        with gr.Row():
	            with gr.Column(scale=1):
	                cap_video_upload = gr.File(label="Upload Video", file_types=[".mp4",".mov",".mkv",".avi",".webm"])
	            with gr.Column(scale=2):
	                cap_video_player = gr.Video(label="Preview - scrub to find captions", interactive=False, height=300)
	        # Mirror the uploaded file into the read-only player.
	        cap_video_upload.change(fn=lambda f: f, inputs=[cap_video_upload], outputs=[cap_video_player])
	        # Shared by both sub-tabs: the frame position used for detection / extraction.
	        cap_timestamp = gr.Slider(label="Timestamp to sample caption detection (seconds)", minimum=0, maximum=300, value=3, step=0.5)
	        gr.HTML("<div style='height:1px;background:#0d2137;margin:8px 0;'></div>")
	        with gr.Tabs():
	            with gr.Tab("🤖 AUTO DETECT"):
	                with gr.Row(equal_height=True):
	                    with gr.Column(scale=1):
	                        auto_method = gr.Dropdown(label="Inpaint Method", choices=["hybrid","telea","ns","blur","black","bg"], value="hybrid")
	                        auto_preview_btn = gr.Button("PREVIEW DETECTION", variant="secondary", size="lg")
	                        auto_remove_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
	                        auto_log = gr.Textbox(label="System Log", interactive=False, lines=4)
	                        auto_out = gr.File(label="Download Result")
	                    with gr.Column(scale=1):
	                        auto_preview_img = gr.Image(label="Detection Preview - red = pixels to be removed", type="pil", interactive=False, height=360)
	                # Both handlers return (result, log text) in the order of ``outputs``.
	                auto_preview_btn.click(fn=auto_preview, inputs=[cap_video_upload, cap_timestamp], outputs=[auto_preview_img, auto_log])
	                auto_remove_btn.click(fn=auto_remove, inputs=[cap_video_upload, cap_timestamp, auto_method], outputs=[auto_out, auto_log])
	            with gr.Tab("✏️ MANUAL PAINT"):
	                with gr.Row(equal_height=False):
	                    with gr.Column(scale=1):
	                        manual_method = gr.Dropdown(label="Inpaint Method", choices=["hybrid","telea","ns","blur","black","bg"], value="hybrid")
	                        manual_extract_btn = gr.Button("EXTRACT FRAME", variant="secondary", size="lg")
	                        manual_remove_btn = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
	                        manual_log = gr.Textbox(label="System Log", interactive=False, lines=4)
	                        manual_out = gr.File(label="Download Result")
	                    with gr.Column(scale=2):
	                        # The editor value is what ``manual_remove`` receives as ``editor_value``.
	                        manual_editor = gr.ImageEditor(label="Paint over the caption text", type="pil", height=420,
	                            brush=gr.Brush(colors=["#ff0000","#ffffff"], color_mode="fixed", default_size=10),
	                            eraser=gr.Eraser(default_size=24), layers=False, interactive=True)
	                # The extracted frame is loaded into the editor; removal reads the painted editor state.
	                manual_extract_btn.click(fn=manual_extract_frame, inputs=[cap_video_upload, cap_timestamp], outputs=[manual_editor, manual_log])
	                manual_remove_btn.click(fn=manual_remove, inputs=[cap_video_upload, manual_editor, manual_method], outputs=[manual_out, manual_log])