"""Semantic video-frame search: download a YouTube video, sample frames,
caption/embed them, index with FAISS, and query by free text (CLIP).
Also provides DeepFace-based face cropping for sampled frames."""

import os
import tempfile

import cv2
import faiss
import gradio as gr
import numpy as np
import torch
import yt_dlp
from deepface import DeepFace
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import (
    BlipForConditionalGeneration,
    BlipProcessor,
    CLIPModel,
    CLIPProcessor,
    CLIPTokenizerFast,
)

# DeepFace face-detection backends, selectable by index below.
detectors = ["opencv", "ssd", "mtcnn", "dlib", "retinaface"]

MODEL_ID = "openai/clip-vit-base-patch32"
tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID)
processor = CLIPProcessor.from_pretrained(MODEL_ID)

caption_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)


def embed_func(query):
    """Embed a text query with CLIP.

    Args:
        query: free-text search string.

    Returns:
        1-D numpy array (the CLIP text feature vector).
    """
    inputs = tokenizer([query], padding=True, return_tensors="pt")
    # Inference only: skip autograd graph construction.
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    return text_features.detach().numpy()[0]


def download_youtube_video(youtube_url):
    """Download a YouTube video into a fresh temp directory.

    Returns:
        Local filesystem path of the downloaded file.
    """
    temp_dir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'best',
        'outtmpl': os.path.join(temp_dir, '%(title)s.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(youtube_url, download=True)
        return ydl.prepare_filename(info_dict)


def extract_unique_frames(video_path, interval_sec=1):
    """Sample roughly one frame per ``interval_sec`` seconds from a video.

    Uses frame-index stepping derived from the stream's fps. (The previous
    millisecond-modulo test was fps-dependent and could capture near-duplicate
    frames around each interval boundary.)

    Returns:
        List of RGB PIL.Image frames in temporal order.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if fps metadata is missing
    step = max(1, int(round(fps * interval_sec)))
    frames = []
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % step == 0:
            # OpenCV decodes BGR; convert for PIL/model consumption.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(rgb))
        frame_idx += 1
    cap.release()
    return frames


def caption_image(image: Image.Image):
    """Generate a natural-language caption for one image with BLIP."""
    inputs = caption_processor(images=image, return_tensors="pt")
    out = caption_model.generate(**inputs)
    return caption_processor.decode(out[0], skip_special_tokens=True)


def build_vector_store(embed):
    """Build a flat L2 FAISS index over a 2-D embedding matrix.

    Args:
        embed: array-like of shape (n_items, dim).

    Returns:
        faiss.IndexFlatL2 populated with the embeddings.
    """
    # FAISS requires contiguous float32 input.
    embeddings = np.ascontiguousarray(embed, dtype='float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def search_query(text, index):
    """Return the row index of the single nearest embedding to ``text``."""
    query_vec = embed_func(text).reshape(1, -1).astype('float32')
    D, I = index.search(query_vec, k=1)
    return I[0][0]


def face_crop(image_pil):
    """Detect and crop faces in a PIL image via DeepFace (MTCNN backend).

    Returns:
        DeepFace's list of detected-face dicts, or None when detection
        fails or no face is found (best-effort contract).
    """
    # DeepFace/OpenCV expect BGR channel order.
    img_bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
    try:
        return DeepFace.extract_faces(img_bgr, detector_backend=detectors[2])
    except Exception:
        # DeepFace raises (e.g. ValueError) when no face is detected;
        # return None explicitly instead of silently falling through.
        return None