# method.py — video frame search demo (Asarkar07, commit 8be4809)
import gradio as gr
import cv2
import yt_dlp
import os
import torch
import numpy as np
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer
import tempfile
import faiss
from deepface import DeepFace
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast
# Face-detector backends supported by DeepFace; face_crop() below selects
# one of these by index.
detectors = ["opencv", "ssd", "mtcnn", "dlib", "retinaface"]
# CLIP checkpoint used for text embeddings (see embed_func).
MODEL_ID = "openai/clip-vit-base-patch32"
tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID)
processor = CLIPProcessor.from_pretrained(MODEL_ID)  # NOTE(review): loaded but unused in this chunk — confirm it is needed
# BLIP image-captioning model/processor used by caption_image().
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
def embed_func(query):
    """Embed a text query with CLIP and return a 1-D numpy vector.

    Parameters
    ----------
    query : str
        Free-text query to embed.

    Returns
    -------
    numpy.ndarray
        1-D array of CLIP text features for the query.
    """
    inputs = tokenizer([query], padding=True, return_tensors="pt")
    # Inference only — disable autograd bookkeeping to save time and memory.
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    return text_features.detach().numpy()[0]
def download_youtube_video(youtube_url):
    """Download a YouTube video into a fresh temporary directory.

    Parameters
    ----------
    youtube_url : str
        URL of the video to fetch.

    Returns
    -------
    str
        Local filesystem path of the downloaded file.
    """
    target_dir = tempfile.mkdtemp()
    options = {
        'format': 'best',
        'outtmpl': os.path.join(target_dir, '%(title)s.%(ext)s'),
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(youtube_url, download=True)
        return downloader.prepare_filename(info)
def extract_unique_frames(video_path, interval_sec=1):
    """Sample frames from a video roughly every ``interval_sec`` seconds.

    Parameters
    ----------
    video_path : str
        Path to a video file readable by OpenCV.
    interval_sec : int or float, optional
        Desired spacing between sampled frames, in seconds.

    Returns
    -------
    list[PIL.Image.Image]
        Sampled frames converted from BGR to RGB PIL images.
    """
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Some containers report 0/NaN fps; fall back to 30 and always advance
    # at least one frame per sample.
    step = max(1, int(round((fps if fps and fps > 0 else 30) * interval_sec)))
    frames = []
    frame_idx = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # Deterministic frame-index sampling. The previous
            # POS_MSEC % (interval*1000) < 50 test was fps-dependent and
            # could capture several near-duplicate frames (or skip an
            # interval entirely) depending on the stream's frame timing.
            if frame_idx % step == 0:
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(rgb))
            frame_idx += 1
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()
    return frames
def caption_image(image: Image.Image):
    """Generate a BLIP caption for a single PIL image.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to describe.

    Returns
    -------
    str
        Decoded caption with special tokens stripped.
    """
    model_inputs = caption_processor(images=image, return_tensors="pt")
    generated_ids = caption_model.generate(**model_inputs)
    return caption_processor.decode(generated_ids[0], skip_special_tokens=True)
def build_vector_store(embed):
    """Build a FAISS flat L2 index over a matrix of embeddings.

    Parameters
    ----------
    embed : numpy.ndarray
        2-D array of shape (num_vectors, dim).

    Returns
    -------
    faiss.IndexFlatL2
        Index containing all rows of ``embed``.
    """
    # FAISS requires C-contiguous float32 input. search_query() already
    # casts its query vector to float32; mirror that here for the stored
    # vectors so add() cannot fail on float64 input.
    vectors = np.ascontiguousarray(embed, dtype='float32')
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
def search_query(text, index):
    """Return the id of the stored embedding nearest to a text query.

    Parameters
    ----------
    text : str
        Natural-language query, embedded via embed_func().
    index : faiss.Index
        Index built by build_vector_store().

    Returns
    -------
    int
        Row id of the single nearest neighbour (k=1 search).
    """
    # embed_func already returns an ndarray, so no extra np.array() copy is
    # needed; FAISS just requires a (1, dim) float32 query.
    query_vec = embed_func(text).reshape(1, -1).astype('float32')
    distances, ids = index.search(query_vec, k=1)
    return ids[0][0]
def face_crop(image_pil):
    """Detect and extract faces from a PIL image with DeepFace.

    Parameters
    ----------
    image_pil : PIL.Image.Image
        Input image, assumed RGB channel order.

    Returns
    -------
    list[dict] or None
        DeepFace.extract_faces() result, or None when detection fails
        (e.g. no face found).
    """
    # DeepFace/OpenCV expect BGR channel order.
    img_bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
    try:
        return DeepFace.extract_faces(img_bgr, detector_backend=detectors[2])
    except Exception:
        # DeepFace raises (ValueError) when no face is detected. Keep the
        # original best-effort contract, but return None explicitly instead
        # of silently falling off the end of the function.
        return None