"""Semantic video-frame search: download a YouTube video, sample frames,
caption/embed them, index with FAISS, and query by free text (CLIP).
Also provides DeepFace-based face cropping for sampled frames."""

import os
import tempfile

import cv2
import faiss
import gradio as gr
import numpy as np
import torch
import yt_dlp
from deepface import DeepFace
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import (
    BlipForConditionalGeneration,
    BlipProcessor,
    CLIPModel,
    CLIPProcessor,
    CLIPTokenizerFast,
)

# DeepFace face-detection backends, selectable by index below.
detectors = ["opencv", "ssd", "mtcnn", "dlib", "retinaface"]

MODEL_ID = "openai/clip-vit-base-patch32"
tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID)
processor = CLIPProcessor.from_pretrained(MODEL_ID)

caption_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)


def embed_func(query):
    """Embed a text query with CLIP.

    Args:
        query: free-text search string.

    Returns:
        1-D numpy array (the CLIP text feature vector).
    """
    inputs = tokenizer([query], padding=True, return_tensors="pt")
    # Inference only: skip autograd graph construction.
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    return text_features.detach().numpy()[0]


def download_youtube_video(youtube_url):
    """Download a YouTube video into a fresh temp directory.

    Returns:
        Local filesystem path of the downloaded file.
    """
    temp_dir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'best',
        'outtmpl': os.path.join(temp_dir, '%(title)s.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(youtube_url, download=True)
        return ydl.prepare_filename(info_dict)


def extract_unique_frames(video_path, interval_sec=1):
    """Sample roughly one frame per ``interval_sec`` seconds from a video.

    Uses frame-index stepping derived from the stream's fps. (The previous
    millisecond-modulo test was fps-dependent and could capture near-duplicate
    frames around each interval boundary.)

    Returns:
        List of RGB PIL.Image frames in temporal order.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if fps metadata is missing
    step = max(1, int(round(fps * interval_sec)))
    frames = []
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % step == 0:
            # OpenCV decodes BGR; convert for PIL/model consumption.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(rgb))
        frame_idx += 1
    cap.release()
    return frames


def caption_image(image: Image.Image):
    """Generate a natural-language caption for one image with BLIP."""
    inputs = caption_processor(images=image, return_tensors="pt")
    out = caption_model.generate(**inputs)
    return caption_processor.decode(out[0], skip_special_tokens=True)


def build_vector_store(embed):
    """Build a flat L2 FAISS index over a 2-D embedding matrix.

    Args:
        embed: array-like of shape (n_items, dim).

    Returns:
        faiss.IndexFlatL2 populated with the embeddings.
    """
    # FAISS requires contiguous float32 input.
    embeddings = np.ascontiguousarray(embed, dtype='float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def search_query(text, index):
    """Return the row index of the single nearest embedding to ``text``."""
    query_vec = embed_func(text).reshape(1, -1).astype('float32')
    D, I = index.search(query_vec, k=1)
    return I[0][0]


def face_crop(image_pil):
    """Detect and crop faces in a PIL image via DeepFace (MTCNN backend).

    Returns:
        DeepFace's list of detected-face dicts, or None when detection
        fails or no face is found (best-effort contract).
    """
    # DeepFace/OpenCV expect BGR channel order.
    img_bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
    try:
        return DeepFace.extract_faces(img_bgr, detector_backend=detectors[2])
    except Exception:
        # DeepFace raises (e.g. ValueError) when no face is detected;
        # return None explicitly instead of silently falling through.
        return None