import os
import time
import subprocess
import numpy as np
import cv2
import requests
import gradio as gr
from PIL import Image

# onnxruntime is treated as optional: if importing it fails for any reason,
# _ort_available is set to False and _load_yolo() returns None instead of a
# session.
try:
    import onnxruntime as ort
    _ort_available = True
except Exception:
    _ort_available = False

# URL the ONNX text-detector weights are downloaded from when no local copy exists.
YOLO_MODEL_URL  = "https://github.com/hjunior29/video-text-remover/raw/main/models/text_detector/best.onnx"
# Local path where the downloaded weights are stored and loaded from.
YOLO_MODEL_PATH = "./models/text_detector/best.onnx"
# Cached inference session; stays None until _load_yolo() creates it.
_yolo_session   = None

def _load_yolo():
    """Return the shared ONNX Runtime session for the text detector.

    The session is created on first use and cached in the module-level
    ``_yolo_session``. If the weights are not present at ``YOLO_MODEL_PATH``
    they are downloaded from ``YOLO_MODEL_URL`` first.

    Returns:
        The cached ``ort.InferenceSession``, or ``None`` when onnxruntime
        could not be imported.

    Raises:
        requests.RequestException: If the download fails.
        OSError: If the model file or its directory cannot be written.
    """
    global _yolo_session
    if _yolo_session is not None:
        return _yolo_session
    if not _ort_available:
        return None
    os.makedirs(os.path.dirname(YOLO_MODEL_PATH), exist_ok=True)
    if not os.path.exists(YOLO_MODEL_PATH):
        print("Downloading YOLO11 text detector model...")
        # Stream into a sidecar file and move it into place only once the
        # download is complete; otherwise an interrupted download would leave
        # a truncated model that later runs would treat as valid.
        part_path = YOLO_MODEL_PATH + ".part"
        try:
            with requests.get(YOLO_MODEL_URL, timeout=120, stream=True) as r:
                r.raise_for_status()
                with open(part_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(part_path, YOLO_MODEL_PATH)
        finally:
            # Only still present when the download did not complete.
            if os.path.exists(part_path):
                os.remove(part_path)
    # Request only providers this onnxruntime build actually offers, keeping
    # the original preference order (CUDA first, then CPU).
    preferred = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    available = set(ort.get_available_providers())
    providers = [p for p in preferred if p in available] or ["CPUExecutionProvider"]
    _yolo_session = ort.InferenceSession(YOLO_MODEL_PATH, providers=providers)
    return _yolo_session

def _yolo_preprocess(frame_bgr, input_size=640):
    h, w = frame_bgr.shape[:2]
    scale = input_size / max(h, w)
    nh, nw = int(h * scale), int(w * scale)
    resized = cv2.resize(frame_bgr, (nw, nh))
    padded = np.zeros((input_size, input_size, 3), dtype=np.uint8)
    padded[:nh, :nw] = resized
    rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    tensor = np.transpose(rgb, (2, 0, 1))[np.newaxis]
    return tensor, scale, nh, nw

def _yolo_postprocess(outputs, scale, orig_h, orig_w, nw, nh, conf_thresh=0.25, margin=5):
    preds = outputs[0]
    if preds.ndim == 3 and preds.shape[1] < preds.shape[2]:
        preds = preds.transpose(0, 2, 1)
    preds = preds[0]
    mask = np.zeros((orig_h, orig_w), dtype=np.uint8)
    for det in preds:
        if det.shape[0] < 5: continue
        conf = float(det[4:].max())
        if conf < conf_thresh: continue
        cx, cy, bw, bh = det[:4]
        x1 = max(0, int((cx - bw/2) / scale) - margin)
        y1 = max(0, int((cy - bh/2) / scale) - margin)
        x2 = min(orig_w, int((cx + bw/2) / scale) + margin)
        y2 = min(orig_h, int((cy + bh/2) / scale) + margin)
        cv2.rectangle(mask, (x1, y1), (x2, y2), 255, -1)
    return mask

def detect_text_mask_yolo(frame_bgr):
    """Run the ONNX text detector on one frame and return its mask.

    Args:
        frame_bgr: Frame to analyse; passed to ``_yolo_preprocess``.

    Returns:
        The mask built by ``_yolo_postprocess`` (same height/width as the
        frame), or ``None`` when no session is available or any step raises.
        Errors are printed, not propagated.
    """
    try:
        session = _load_yolo()
        if session is None:
            return None
        frame_h, frame_w = frame_bgr.shape[:2]
        tensor, scale, nh, nw = _yolo_preprocess(frame_bgr)
        # Feed the tensor to the model's first declared input.
        feed = {session.get_inputs()[0].name: tensor}
        raw_outputs = session.run(None, feed)
        return _yolo_postprocess(raw_outputs, scale, frame_h, frame_w, nw, nh)
    except Exception as e:
        print(f"YOLO detect error: {e}")
        return None

def detect_caption_bbox_fallback(frame_bgr):
    """Heuristically locate a caption bar without the neural detector.

    Two horizontal bands are searched: the bottom of the frame (from 65% of
    the height downwards) and the top 12%. Each band is Otsu-thresholded and
    closed with a wide, flat kernel so the letters of a line merge into one
    blob. The largest blob that is at least 20% of the frame width and
    between 8 px and 25% of the frame height tall wins.

    Args:
        frame_bgr: BGR frame, or an already single-channel (2-D) image.

    Returns:
        ``(x1, y1, x2, y2)`` in frame coordinates, clamped to the frame, or
        ``None`` if nothing qualifies or the frame is ``None``/empty.
    """
    if frame_bgr is None or getattr(frame_bgr, "size", 0) == 0:
        return None
    h, w = frame_bgr.shape[:2]
    if frame_bgr.ndim == 2:
        gray = frame_bgr
    else:
        gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    # The kernel is the same for both bands. Its width must be at least 1:
    # w // 8 is 0 for frames narrower than 8 px, which OpenCV rejects.
    kern = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, w // 8), 3))
    pad = 6
    best_box, best_area = None, 0
    for y_start, y_end in [(int(h*0.65), h), (0, int(h*0.12))]:
        if y_end <= y_start:
            continue
        region = gray[y_start:y_end, :]
        _, thresh = cv2.threshold(region, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kern)
        contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for cnt in contours:
            cx, cy, cw, ch2 = cv2.boundingRect(cnt)
            if cw < w*0.2 or ch2 < 8 or ch2 > h*0.25:
                continue
            area = cw * ch2
            if area > best_area:
                best_area = area
                # Contour coordinates are relative to the band, so y is
                # shifted back by y_start.
                # NOTE(review): the right/bottom edges are padded by pad*2
                # while left/top use pad; kept as-is, confirm it is intended.
                best_box = (max(0, cx-pad), max(0, y_start+cy-pad),
                            min(w, cx+cw+pad*2), min(h, y_start+cy+ch2+pad*2))
    return best_box

def get_video_dims(video_path):
    """Return ``(width, height)`` as reported by OpenCV for ``video_path``.

    The values are the capture's frame-width and frame-height properties
    converted to ``int``.
    """
    capture = cv2.VideoCapture(video_path)
    dims = (
        int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
        int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)),
    )
    capture.release()
    return dims

def extract_frame_at(video_path, timestamp_sec):
    """Grab one frame of a video as a PIL image.

    The requested timestamp is clamped to ``[0, duration - 0.1]`` when the
    container reports a frame count. If the seek/read fails, the first frame
    of the video is returned instead.

    Args:
        video_path: Path to the video; falsy values short-circuit.
        timestamp_sec: Desired position in seconds.

    Returns:
        ``(image, message)``. ``image`` is an RGB ``PIL.Image`` or ``None``
        on failure; ``message`` is a status line for the UI.
    """
    if not video_path:
        return None, "Upload a video first."
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        ts = max(0.0, float(timestamp_sec))
        # Only clamp to the duration when it is known. With an unreported
        # frame count (0) the old clamp forced every request to 0 s.
        if total_frames > 0:
            ts = min(ts, max(0.0, total_frames / fps - 0.1))
        cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
        ret, frame = cap.read()
        if not ret:
            # Seek failed: reopen and fall back to the very first frame.
            cap.release()
            cap = cv2.VideoCapture(video_path)
            ret, frame = cap.read()
            if not ret:
                return None, "Could not read frame."
            # Report the position actually shown, not the one requested.
            ts = 0.0
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)), \
               f"Frame at {ts:.1f}s loaded. Paint over captions then click REMOVE CAPTIONS."
    except Exception as e:
        return None, f"Error: {e}"
    finally:
        # Release the capture on every path, including exceptions.
        if cap is not None:
            cap.release()

def run_opencv_inpaint(abs_input, output_path, mask_img, method='hybrid'):
    """Apply a fixed mask to every frame of a video and write an H.264 MP4.

    Frames are processed with OpenCV into a temporary ``mp4v`` file, which
    ffmpeg then re-encodes with libx264. The audio is taken directly from the
    original input (first audio stream, if any) and encoded as AAC, so audio
    in codecs other than AAC is preserved as well.

    Args:
        abs_input: Path of the source video.
        output_path: Path of the final MP4; ``output_path + "_tmp.mp4"`` is
            used as scratch space and removed afterwards.
        mask_img: Single-channel ``uint8`` mask; pixels equal to 255 are
            removed. It is resized to the video dimensions.
        method: ``'hybrid'`` (Telea inpaint on a dilated mask), ``'ns'``
            (Navier-Stokes inpaint), ``'blur'`` (Gaussian blur), ``'black'``
            (fill with 0), ``'bg'`` (fill with the mean colour of a ring
            around the mask); any other value uses plain Telea inpainting.

    Returns:
        ``(True, summary)`` on success, ``(False, error_text)`` on failure.
        Exceptions are not propagated.
    """
    import traceback

    temp_vid = output_path + "_tmp.mp4"
    cap = None
    out = None
    try:
        vid_w, vid_h = get_video_dims(abs_input)
        if vid_w <= 0 or vid_h <= 0:
            return False, f"Could not read video dimensions from {abs_input}"
        mask = cv2.resize(mask_img, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)

        # Everything derived from the mask alone is frame-independent, so it
        # is computed once here instead of once per frame.
        selected = mask == 255
        expanded = None
        border = None
        if method == 'hybrid':
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
            expanded = cv2.dilate(mask, kernel, iterations=1)
        elif method == 'bg':
            kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))
            # Ring of pixels just outside the mask, used to sample the fill colour.
            border = cv2.dilate(mask, kernel2) - mask

        cap = cv2.VideoCapture(abs_input)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(temp_vid, fourcc, fps, (vid_w, vid_h))
        if not out.isOpened():
            return False, f"Could not open video writer for {temp_vid}"

        frame_idx = 0
        t_start = time.time()
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if method == 'hybrid':
                inpainted = cv2.inpaint(frame, expanded, inpaintRadius=3, flags=cv2.INPAINT_TELEA)
            elif method == 'ns':
                inpainted = cv2.inpaint(frame, mask, inpaintRadius=3, flags=cv2.INPAINT_NS)
            elif method == 'blur':
                inpainted = frame.copy()
                blurred = cv2.GaussianBlur(frame, (51, 51), 30)
                inpainted[selected] = blurred[selected]
            elif method == 'black':
                inpainted = frame.copy()
                inpainted[selected] = 0
            elif method == 'bg':
                inpainted = frame.copy()
                mean_color = cv2.mean(frame, mask=border)[:3]
                inpainted[selected] = [int(mean_color[0]), int(mean_color[1]), int(mean_color[2])]
            else:
                inpainted = cv2.inpaint(frame, mask, inpaintRadius=2, flags=cv2.INPAINT_TELEA)
            out.write(inpainted)
            frame_idx += 1
        elapsed = time.time() - t_start
        fps_rate = frame_idx / max(elapsed, 0.1)

        # The writer must be closed before ffmpeg reads the temporary file.
        cap.release()
        cap = None
        out.release()
        out = None
        if frame_idx == 0:
            return False, f"No frames could be read from {abs_input}"

        # Video comes from the processed file, audio straight from the source.
        # The trailing "?" makes the audio mapping optional, so silent inputs
        # still encode.
        subprocess.run(["ffmpeg", "-y", "-i", temp_vid, "-i", abs_input,
                        "-map", "0:v:0", "-map", "1:a:0?",
                        "-c:v", "libx264", "-preset", "fast", "-crf", "18",
                        "-c:a", "aac", "-b:a", "192k", "-movflags", "+faststart",
                        output_path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True, f"{frame_idx} frames in {elapsed:.1f}s ({fps_rate:.1f} fps)"
    except subprocess.CalledProcessError as e:
        # Surface the tail of ffmpeg's own diagnostics; the traceback alone
        # only shows the exit status.
        stderr_tail = (e.stderr or b"").decode("utf-8", errors="replace")[-2000:]
        return False, f"{traceback.format_exc()}\nffmpeg stderr:\n{stderr_tail}"
    except Exception:
        return False, traceback.format_exc()
    finally:
        # Runs on success and failure alike: close handles, drop scratch file.
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
        if os.path.exists(temp_vid):
            os.remove(temp_vid)

def auto_preview(video_path, timestamp_sec=3.0):
    """Show which pixels automatic detection would remove at a timestamp.

    The frame at ``timestamp_sec`` is run through the YOLO detector; if that
    yields no mask (or an all-zero one) the OpenCV fallback box is used.
    Masked pixels are tinted red at 50% opacity.

    Args:
        video_path: Path to the video; falsy values short-circuit.
        timestamp_sec: Position to sample, in seconds (negative values are
            treated as 0).

    Returns:
        ``(image, message)`` where ``image`` is a ``PIL.Image`` (or ``None``
        on failure) and ``message`` is a status line for the UI.
    """
    if not video_path:
        return None, "Upload a video first."
    try:
        seek_sec = max(0.0, float(timestamp_sec))
        capture = cv2.VideoCapture(video_path)
        capture.set(cv2.CAP_PROP_POS_MSEC, seek_sec * 1000)
        got_frame, frame = capture.read()
        capture.release()
        if not got_frame:
            return None, "Could not read frame."

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        height, width = frame.shape[:2]

        detector = "YOLO11"
        mask = detect_text_mask_yolo(frame)
        if mask is None or mask.max() == 0:
            # Neural detector unavailable or found nothing: try the heuristic.
            detector = "OpenCV fallback"
            box = detect_caption_bbox_fallback(frame)
            if box is None:
                return Image.fromarray(frame_rgb), "No captions detected. Try Manual mode or adjust timestamp."
            left, top, right, bottom = box
            mask = np.zeros((height, width), dtype=np.uint8)
            cv2.rectangle(mask, (left, top), (right, bottom), 255, -1)

        # Blend the untouched frame with a copy whose masked pixels are red.
        tinted = frame_rgb.copy()
        tinted[mask == 255] = [220, 40, 40]
        blended = cv2.addWeighted(frame_rgb.copy(), 0.5, tinted, 0.5, 0)
        n_pixels = int(mask.sum() / 255)
        return Image.fromarray(blended), f"{n_pixels} text pixels found via {detector}. Red = will be removed."
    except Exception as e:
        return None, f"Error: {e}"

def auto_remove(video_path, timestamp_sec=3.0, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Detect captions on one sampled frame and remove that region from the whole video.

    The mask comes from the YOLO detector, or from the OpenCV fallback box
    when YOLO yields nothing. It is then applied to every frame through
    ``run_opencv_inpaint``.

    Args:
        video_path: Path to the source video; falsy values short-circuit.
        timestamp_sec: Position of the frame used for detection, in seconds.
        inpaint_method: Method name forwarded to ``run_opencv_inpaint``.
        save_folder: Directory for the result; created if missing.

    Returns:
        ``(output_path, message)`` on success, ``(None, message)`` otherwise.
        The output file is named ``<input stem>_NO_CAPTIONS.mp4``.
    """
    if not video_path:
        return None, "No video uploaded."
    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    stem = os.path.splitext(os.path.basename(video_path))[0]
    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))
    try:
        capture = cv2.VideoCapture(abs_input)
        capture.set(cv2.CAP_PROP_POS_MSEC, max(0.0, float(timestamp_sec)) * 1000)
        got_frame, frame = capture.read()
        capture.release()
        if not got_frame:
            return None, "Could not read frame."

        vid_w, vid_h = get_video_dims(abs_input)
        detector = "YOLO11"
        mask = detect_text_mask_yolo(frame)
        yolo_found_text = mask is not None and mask.max() != 0
        if yolo_found_text:
            # Bring the detector mask to the dimensions reported for the video.
            mask = cv2.resize(mask, (vid_w, vid_h), interpolation=cv2.INTER_NEAREST)
        else:
            detector = "OpenCV fallback"
            box = detect_caption_bbox_fallback(frame)
            if box is None:
                return None, "No captions detected. Try Manual mode."
            left, top, right, bottom = box
            mask = np.zeros((vid_h, vid_w), dtype=np.uint8)
            cv2.rectangle(mask, (left, top), (right, bottom), 255, -1)

        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ {detector} + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        import traceback
        return None, f"Error:\n{traceback.format_exc()}"

def manual_extract_frame(video_path, timestamp_sec):
    """Manual-mode entry point: delegate to ``extract_frame_at`` and return its result unchanged."""
    frame_and_status = extract_frame_at(video_path, timestamp_sec)
    return frame_and_status

def manual_remove(video_path, editor_value, inpaint_method='hybrid', save_folder="./Caption-Removed/"):
    """Remove the region the user painted in the image editor from the whole video.

    The mask is built from the editor's ``"layers"`` (any pixel with alpha
    above 10). If that yields nothing, it falls back to the pixels where
    ``"composite"`` differs from ``"background"`` by more than 15 grey levels.

    Args:
        video_path: Path to the source video; falsy values short-circuit.
        editor_value: Mapping read via ``.get`` for the keys ``"layers"``,
            ``"composite"`` and ``"background"``; ``None`` short-circuits.
        inpaint_method: Method name forwarded to ``run_opencv_inpaint``.
        save_folder: Directory for the result; created if missing.

    Returns:
        ``(output_path, message)`` on success, ``(None, message)`` otherwise.
    """
    if not video_path:
        return None, "No video uploaded."
    if editor_value is None:
        return None, "Extract frame and paint over captions first."
    os.makedirs(save_folder, exist_ok=True)
    abs_input = os.path.abspath(video_path)
    stem = os.path.splitext(os.path.basename(video_path))[0]
    output_path = os.path.abspath(os.path.join(save_folder, f"{stem}_NO_CAPTIONS.mp4"))
    try:
        vid_w, vid_h = get_video_dims(abs_input)
        target_size = (vid_w, vid_h)
        painted_layers = editor_value.get("layers", [])
        composite = editor_value.get("composite")
        background = editor_value.get("background")

        mask = np.zeros((vid_h, vid_w), dtype=np.uint8)
        # Primary source: the alpha channel of every painted layer.
        for layer in (painted_layers or ()):
            if layer is None:
                continue
            rgba = np.array(layer.convert("RGBA"))
            rgba = cv2.resize(rgba, target_size, interpolation=cv2.INTER_NEAREST)
            alpha = rgba[:, :, 3]
            mask[alpha > 10] = 255

        # Secondary source: wherever the composite no longer matches the background.
        layers_empty = mask.max() == 0
        if layers_empty and composite is not None and background is not None:
            composite_rgb = cv2.resize(np.array(composite.convert("RGB")), target_size)
            background_rgb = cv2.resize(np.array(background.convert("RGB")), target_size)
            difference = cv2.absdiff(composite_rgb, background_rgb)
            grey_diff = cv2.cvtColor(difference, cv2.COLOR_RGB2GRAY)
            mask[grey_diff > 15] = 255

        if mask.max() == 0:
            return None, "No strokes detected. Paint over the captions."

        ok, info = run_opencv_inpaint(abs_input, output_path, mask, method=inpaint_method)
        if not ok:
            return None, f"Inpaint failed:\n{info}"
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        return output_path, f"Done!\nSIZE ▸ {size_mb:.1f} MB\nMETHOD ▸ Manual + {inpaint_method.upper()}\nTIME ▸ {info}"
    except Exception:
        import traceback
        return None, f"Error:\n{traceback.format_exc()}"

def build_tab():
    """Build the "CAPTION REMOVER" Gradio tab and wire its event handlers.

    Must be called inside an active Gradio layout context, since components
    are attached to whichever container is open when they are created. The
    tab holds a shared upload/preview/timestamp area plus two sub-tabs:
    automatic detection (``auto_preview`` / ``auto_remove``) and manual
    painting (``manual_extract_frame`` / ``manual_remove``).
    """
    with gr.Tab("🖌️  CAPTION  REMOVER"):
        # Static header with usage instructions.
        gr.HTML("""<div style="padding:16px 4px 6px;">
            <div style="font-family:'Orbitron',sans-serif;font-size:.65rem;font-weight:700;color:#00d4ff;letter-spacing:.2em;text-transform:uppercase;margin-bottom:6px;">YOLO11 + OpenCV Inpainting</div>
            <div style="font-family:'Share Tech Mono',monospace;font-size:.75rem;color:#2a5570;line-height:1.9;">
                Upload video, scrub to where captions appear, use Auto or Manual.<br>
                <span style="color:#ff3c6e;">Complex backgrounds may still show minor artifacts on CPU.</span>
            </div></div>""")
        # Shared inputs: the uploaded file and a read-only player to scrub through it.
        with gr.Row():
            with gr.Column(scale=1):
                cap_video_upload = gr.File(label="Upload Video", file_types=[".mp4",".mov",".mkv",".avi",".webm"])
            with gr.Column(scale=2):
                cap_video_player = gr.Video(label="Preview - scrub to find captions", interactive=False, height=300)
        # Mirror the uploaded file into the player whenever the upload changes.
        cap_video_upload.change(fn=lambda f: f, inputs=[cap_video_upload], outputs=[cap_video_player])
        # Timestamp shared by both sub-tabs; passed as timestamp_sec to the handlers.
        cap_timestamp = gr.Slider(label="Timestamp to sample caption detection (seconds)", minimum=0, maximum=300, value=3, step=0.5)
        # Thin horizontal divider.
        gr.HTML("<div style='height:1px;background:#0d2137;margin:8px 0;'></div>")
        with gr.Tabs():
            # --- Automatic mode: detect captions on the sampled frame. ---
            with gr.Tab("🤖  AUTO DETECT"):
                with gr.Row(equal_height=True):
                    with gr.Column(scale=1):
                        auto_method      = gr.Dropdown(label="Inpaint Method", choices=["hybrid","telea","ns","blur","black","bg"], value="hybrid")
                        auto_preview_btn = gr.Button("PREVIEW DETECTION", variant="secondary", size="lg")
                        auto_remove_btn  = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        auto_log         = gr.Textbox(label="System Log", interactive=False, lines=4)
                        auto_out         = gr.File(label="Download Result")
                    with gr.Column(scale=1):
                        auto_preview_img = gr.Image(label="Detection Preview - red = pixels to be removed", type="pil", interactive=False, height=360)
                # Each handler returns (result, message); the message goes to the log box.
                auto_preview_btn.click(fn=auto_preview, inputs=[cap_video_upload, cap_timestamp], outputs=[auto_preview_img, auto_log])
                auto_remove_btn.click(fn=auto_remove, inputs=[cap_video_upload, cap_timestamp, auto_method], outputs=[auto_out, auto_log])
            # --- Manual mode: the user paints the region to remove. ---
            with gr.Tab("✏️  MANUAL  PAINT"):
                with gr.Row(equal_height=False):
                    with gr.Column(scale=1):
                        manual_method      = gr.Dropdown(label="Inpaint Method", choices=["hybrid","telea","ns","blur","black","bg"], value="hybrid")
                        manual_extract_btn = gr.Button("EXTRACT FRAME", variant="secondary", size="lg")
                        manual_remove_btn  = gr.Button("REMOVE CAPTIONS", variant="primary", size="lg")
                        manual_log         = gr.Textbox(label="System Log", interactive=False, lines=4)
                        manual_out         = gr.File(label="Download Result")
                    with gr.Column(scale=2):
                        # The editor value is handed to manual_remove, which reads its
                        # "layers", "composite" and "background" entries.
                        manual_editor = gr.ImageEditor(label="Paint over the caption text", type="pil", height=420,
                            brush=gr.Brush(colors=["#ff0000","#ffffff"], color_mode="fixed", default_size=10),
                            eraser=gr.Eraser(default_size=24), layers=False, interactive=True)
                # Extract loads a frame into the editor; remove consumes the painted result.
                manual_extract_btn.click(fn=manual_extract_frame, inputs=[cap_video_upload, cap_timestamp], outputs=[manual_editor, manual_log])
                manual_remove_btn.click(fn=manual_remove, inputs=[cap_video_upload, manual_editor, manual_method], outputs=[manual_out, manual_log])