Spaces:

engajify
/

object_video

Runtime error

App Files Files Community

object_video / app.py

engajify

Upload 4 files

1882592 verified over 1 year ago

raw

history blame contribute delete

6.08 kB

	import gradio as gr
	import torch
	import numpy as np
	from transformers import OwlViTProcessor, OwlViTForObjectDetection, ResNetModel
	from torchvision import transforms
	from PIL import Image
	import cv2
	import torch.nn.functional as F
	import tempfile
	import os

	# Load models
	# Pick the compute device up front; both networks below are moved onto it.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# ResNet-50 backbone used by extract_embedding, switched to eval mode.
	resnet = ResNetModel.from_pretrained("microsoft/resnet-50").to(device)
	resnet.eval()

	# OWL-ViT detector and its matching processor, used by detect_and_crop.
	mixin = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
	processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
	model = mixin.to(device)

	# Preprocess the image
	def preprocess_image(image):
	    """Convert a PIL image into a normalized batch tensor of shape (1, 3, 224, 224).

	    Args:
	        image: A PIL image. Images that are not already in RGB mode
	            (grayscale, RGBA, palette, ...) are converted to RGB first,
	            because the normalization below is defined for three channels.

	    Returns:
	        A float tensor with a leading batch dimension of size 1.
	    """
	    # Normalize() is given three per-channel means/stds, so a 1- or 4-channel
	    # input would fail; force three channels before building the tensor.
	    if image.mode != "RGB":
	        image = image.convert("RGB")

	    transform = transforms.Compose([
	        transforms.Resize((224, 224)),
	        transforms.ToTensor(),
	        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
	    ])
	    # unsqueeze(0) adds the batch dimension.
	    return transform(image).unsqueeze(0)

	def extract_embedding(image):
	    """Return the ResNet ``pooler_output`` for ``image``.

	    The image is preprocessed with ``preprocess_image``, moved to ``device``
	    and run through ``resnet`` with gradient tracking disabled.
	    """
	    batch = preprocess_image(image).to(device)
	    with torch.no_grad():
	        return resnet(batch).pooler_output

	def cosine_similarity(embedding1, embedding2):
	    """Return the cosine similarity of two embedding tensors along dim 1."""
	    # dim=1 is the default of F.cosine_similarity, spelled out for clarity.
	    similarity = F.cosine_similarity(embedding1, embedding2, dim=1)
	    return similarity

	def l2_distance(embedding1, embedding2):
	    """Return the 2-norm of the element-wise difference of two tensors."""
	    difference = embedding1 - embedding2
	    return difference.norm(p=2)

	def save_array_to_temp_image(arr):
	    """Save an image array as a temporary PNG file and return the file path.

	    The array's channels are swapped with ``cv2.COLOR_BGR2RGB`` before saving.
	    The file is created with ``delete=False``, so it stays on disk after this
	    function returns.
	    """
	    pil_image = Image.fromarray(cv2.cvtColor(arr, cv2.COLOR_BGR2RGB))
	    # Reserve a unique .png path; the handle is closed when the block exits
	    # so the image can then be written to that path.
	    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as handle:
	        png_path = handle.name
	    pil_image.save(png_path)
	    return png_path

	def detect_and_crop(target_image, query_image, threshold=0.6, nms_threshold=0.3):
	    """Find regions of ``target_image`` that match ``query_image`` and crop them.

	    Runs image-guided detection with the module-level ``model`` and
	    ``processor`` and cuts every returned box out of the target image.

	    Args:
	        target_image: PIL image to search in.
	        query_image: Image of the object to look for.
	        threshold: Score threshold passed to the post-processing step.
	        nms_threshold: NMS threshold passed to the post-processing step.

	    Returns:
	        A list of numpy arrays, one per non-empty crop. The arrays come from
	        ``np.array(target_image)`` with its channels swapped by
	        ``cv2.COLOR_BGR2RGB``. The list is empty when nothing is detected.
	    """
	    # PIL reports (width, height); reversed here to (height, width).
	    target_sizes = torch.Tensor([target_image.size[::-1]])
	    inputs = processor(images=target_image, query_images=query_image, return_tensors="pt").to(device)
	    with torch.no_grad():
	        outputs = model.image_guided_detection(**inputs)

	    # Bring predictions back to the CPU, where target_sizes lives.
	    outputs.logits = outputs.logits.cpu()
	    outputs.target_pred_boxes = outputs.target_pred_boxes.cpu()

	    results = processor.post_process_image_guided_detection(
	        outputs=outputs,
	        threshold=threshold,
	        nms_threshold=nms_threshold,
	        target_sizes=target_sizes,
	    )
	    boxes = results[0]["boxes"]
	    if len(boxes) == 0:
	        return []

	    img = cv2.cvtColor(np.array(target_image), cv2.COLOR_BGR2RGB)
	    height, width = img.shape[:2]

	    crops = []
	    for box in boxes:
	        x1, y1, x2, y2 = (int(value) for value in box.tolist())
	        # Clamp to the image: a negative coordinate would otherwise be read
	        # as a from-the-end index and produce a wrong or empty crop.
	        x1, y1 = max(x1, 0), max(y1, 0)
	        x2, y2 = min(x2, width), min(y2, height)
	        if x2 <= x1 or y2 <= y1:
	            continue  # box has no area inside the image
	        crops.append(img[y1:y2, x1:x2])

	    return crops

	def process_video(video_path, query_image, skipframes=0):
	    """Compare every detected crop in a video against the query image.

	    Frames are sampled every ``skipframes + 1`` frames. Each sampled frame is
	    passed to ``detect_and_crop``; every returned crop is embedded with
	    ``extract_embedding`` and compared with the query image's embedding.

	    Args:
	        video_path: Path of the video file to read with OpenCV.
	        query_image: PIL image of the object to look for.
	        skipframes: Number of frames to skip between processed frames.
	            Negative values are treated as 0.

	    Returns:
	        A list of ``{'l2_dist': float, 'cos': float}`` dicts, one per crop.
	        The list is empty when the video cannot be opened or nothing is
	        detected.
	    """
	    # Never let the modulo below divide by zero (skipframes == -1).
	    stride = max(int(skipframes), 0) + 1

	    all_results = []
	    query_embedding = None  # computed once, on the first detected crop
	    cap = cv2.VideoCapture(video_path)
	    try:
	        if not cap.isOpened():
	            return all_results

	        frame_count = 0
	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                break
	            if frame_count % stride == 0:
	                # Convert in memory instead of round-tripping through a
	                # temporary PNG: same pixels, no files left behind.
	                frame_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
	                for crop in detect_and_crop(frame_image, query_image):
	                    if query_embedding is None:
	                        # The query image never changes, so embed it once.
	                        query_embedding = extract_embedding(query_image)
	                    crop_image = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
	                    crop_embedding = extract_embedding(crop_image)
	                    all_results.append({
	                        'l2_dist': l2_distance(query_embedding, crop_embedding).item(),
	                        'cos': cosine_similarity(query_embedding, crop_embedding).item(),
	                    })
	            frame_count += 1
	    finally:
	        # Release the capture even if detection or embedding raises.
	        cap.release()
	    return all_results

	def process_videos_and_compare(image, video, skipframes=5, threshold=0.47):
	    """Summarize how closely the objects detected in ``video`` match ``image``.

	    Calls ``process_video`` and reduces its per-crop results to averages and
	    medians of the L2 distance and cosine similarity. ``is_present`` is true
	    when the average cosine similarity is at least ``threshold``. When there
	    are no results, a fixed "not present" summary is returned.
	    """
	    def middle_value(ordered):
	        # Median of an already sorted list.
	        half, is_odd = divmod(len(ordered), 2)
	        if is_odd:
	            return ordered[half]
	        return (ordered[half - 1] + ordered[half]) / 2

	    detections = process_video(video, image, skipframes)
	    if not detections:
	        return {
	            "avg_l2_dist": float('inf'),
	            "avg_cos": 0,
	            "median_l2_dist": float('inf'),
	            "median_cos": 0,
	            "avg_cos_dist": float('inf'),
	            "median_cos_dist": float('inf'),
	            "is_present": False
	        }

	    distances = [entry['l2_dist'] for entry in detections]
	    similarities = [entry['cos'] for entry in detections]

	    mean_distance = sum(distances) / len(distances)
	    mean_similarity = sum(similarities) / len(similarities)
	    mid_distance = middle_value(sorted(distances))
	    mid_similarity = middle_value(sorted(similarities))

	    return {
	        "avg_l2_dist": mean_distance,
	        "avg_cos": mean_similarity,
	        "median_l2_dist": mid_distance,
	        "median_cos": mid_similarity,
	        "avg_cos_dist": 1 - mean_similarity,
	        "median_cos_dist": 1 - mid_similarity,
	        "is_present": mean_similarity >= threshold
	    }

	def interface(video, image, skipframes, threshold):
	    """Gradio callback: forward the inputs to ``process_videos_and_compare``.

	    The callback receives the video first and the image second, while the
	    comparison function takes them in the opposite order.
	    """
	    return process_videos_and_compare(image, video, skipframes, threshold)

	# Gradio UI definition. The entries of `inputs` are listed in the same order
	# as the parameters of `interface`: video, image, skipframes, threshold.
	iface = gr.Interface(
	fn=interface,
	inputs=[
	gr.Video(label="Upload a Video"),
	# type="pil" asks Gradio to hand the image over as a PIL image.
	gr.Image(type="pil", label="Upload a Query Image"),
	gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Skip Frames"),
	gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.47, label="Threshold")
	],
	# The dict returned by `interface` is shown in a JSON component.
	outputs=[
	gr.JSON(label="Result")
	],
	title="Object Detection in Video",
	description="""
	Instructions:

	1. Upload a Video: Select a video file to upload.
	2. Upload a Query Image: Select an image file that contains the object you want to detect in the video.
	3. Set Skip Frames: Adjust the slider to set the number of frames to skip between each processing.
	4. Set Threshold: Adjust the slider to set the threshold for cosine similarity to determine if the object is present in the video.
	5. View Results: The result will show the average and median distances and similarities, and whether the object is present in the video based on the threshold.
	"""
	)

	# Start the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()