# HuggingFace Spaces page header ("Spaces: Sleeping") — scrape artifact, not code.
| import gradio as gr | |
| import cv2 | |
| import yt_dlp | |
| import os | |
| import torch | |
| import numpy as np | |
| from PIL import Image | |
| from transformers import BlipProcessor, BlipForConditionalGeneration | |
| from sentence_transformers import SentenceTransformer | |
| import tempfile | |
| import faiss | |
| from deepface import DeepFace | |
| from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast | |
# Face-detector backends understood by DeepFace; referenced by index in face_crop().
detectors = ["opencv", "ssd", "mtcnn", "dlib", "retinaface"]

# CLIP checkpoint used to embed text queries (see embed_func) into the
# vector space searched by the FAISS index.
MODEL_ID = "openai/clip-vit-base-patch32"
tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID)
processor = CLIPProcessor.from_pretrained(MODEL_ID)

# BLIP captioning model used by caption_image() to describe extracted frames.
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
def embed_func(query):
    """Embed a text query with CLIP and return it as a 1-D numpy vector.

    Args:
        query: Natural-language search string.

    Returns:
        numpy.ndarray of shape (embed_dim,) holding the CLIP text features.
    """
    inputs = tokenizer([query], padding=True, return_tensors="pt")
    # Inference only: no_grad avoids building the autograd graph, saving
    # memory and time; it also makes .numpy() safe without detach().
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    return text_features.numpy()[0]
def download_youtube_video(youtube_url):
    """Download a YouTube video into a fresh temp directory.

    Args:
        youtube_url: URL of the video to fetch.

    Returns:
        Filesystem path of the downloaded video file.
    """
    target_dir = tempfile.mkdtemp()
    options = {
        'format': 'best',
        'outtmpl': os.path.join(target_dir, '%(title)s.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(youtube_url, download=True)
        return downloader.prepare_filename(info)
def extract_unique_frames(video_path, interval_sec=1):
    """Sample one frame roughly every `interval_sec` seconds from a video.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval_sec: Sampling interval in seconds (default 1).

    Returns:
        List of PIL.Image frames (RGB), in playback order.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Fall back to a common default when the container reports no/zero FPS.
    step = max(1, int(round((fps if fps and fps > 0 else 30) * interval_sec)))
    frames = []
    frame_idx = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Frame-index sampling is deterministic; the previous
            # `POS_MSEC % 1000 < 50` check could skip an interval entirely
            # (low fps) or grab several frames per interval (high fps).
            if frame_idx % step == 0:
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(rgb))
            frame_idx += 1
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()
    return frames
def caption_image(image: Image.Image):
    """Generate a natural-language caption for one PIL image via BLIP.

    Args:
        image: Input picture to describe.

    Returns:
        Caption string with special tokens stripped.
    """
    encoded = caption_processor(images=image, return_tensors="pt")
    token_ids = caption_model.generate(**encoded)
    return caption_processor.decode(token_ids[0], skip_special_tokens=True)
def build_vector_store(embed):
    """Build a FAISS exact-L2 index over a 2-D array of embeddings.

    Args:
        embed: Array-like of shape (n_vectors, dim).

    Returns:
        faiss.IndexFlatL2 populated with the vectors.
    """
    # FAISS requires C-contiguous float32 input; coerce defensively here
    # (search_query already casts its query the same way).
    vectors = np.ascontiguousarray(embed, dtype='float32')
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
def search_query(text, index, k=1):
    """Return the row id of the stored embedding closest to a text query.

    Args:
        text: Natural-language query string.
        index: FAISS index built by build_vector_store().
        k: Number of neighbours retrieved internally (default 1; only the
           best match is returned, preserving the original interface).

    Returns:
        Integer id of the nearest embedding in the index.
    """
    # embed_func returns a 1-D vector; FAISS wants a (n_queries, dim)
    # float32 matrix.
    query_vec = embed_func(text).reshape(1, -1).astype('float32')
    _, ids = index.search(query_vec, k=k)
    return ids[0][0]
def face_crop(image_pil):
    """Detect and crop faces in a PIL image using DeepFace.

    Args:
        image_pil: PIL.Image, assumed RGB channel order.

    Returns:
        DeepFace face-extraction result on success, or None when detection
        fails (e.g. no face found) — preserving the original best-effort
        behavior of the bare `except: pass`, which implicitly returned None.
    """
    # DeepFace expects OpenCV-style BGR arrays, so convert from PIL RGB.
    img_bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
    try:
        # detectors[2] == "mtcnn" backend.
        return DeepFace.extract_faces(img_bgr, detector_backend=detectors[2])
    except Exception:
        # Best-effort: swallow detection failures but return None explicitly
        # instead of falling off the end of the function.
        return None