Spaces:

themalinery
/

object_detection

Sleeping

App Files Files Community

object_detection / src /utils.py

themalinery

web app

8023e2e 14 days ago

raw

history blame contribute delete

4.67 kB

	import os
	from moviepy import ImageSequenceClip
	from natsort import natsorted
	from transformers import pipeline
	from transformers.image_utils import load_image
	from PIL import ImageDraw, Image, ImageFont
	from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
	import torch
	import cv2
	import os

	def create_video_from_images(folder_path, output_video_file, fps):
	    """
	    Build a video out of the still images stored in one folder.

	    Files are picked by extension (.jpg, .jpeg, .png, .bmp, .gif, matched
	    case-insensitively) and ordered with ``natsorted`` so that numbered
	    names such as image-2.png come before image-10.png. Problems are
	    reported with ``print``; the function never raises for a missing
	    folder, a folder without enough images, or an encoding failure.

	    Args:
	        folder_path (str): Directory that holds the source images.
	        output_video_file (str): Destination video file (e.g., 'my_video.mp4').
	        fps (int): Frames per second used both for the clip and the export.

	    Returns:
	        None in every case.
	    """
	    # Guard clause: nothing to do without a real directory.
	    if not os.path.isdir(folder_path):
	        print(f"Error: The folder '{folder_path}' does not exist.")
	        return

	    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')

	    # Walk the directory listing in natural order and keep only image files.
	    frame_paths = []
	    for entry in natsorted(os.listdir(folder_path)):
	        if entry.lower().endswith(image_suffixes):
	            frame_paths.append(os.path.join(folder_path, entry))

	    frame_total = len(frame_paths)

	    if frame_total == 0:
	        print(f"Error: No supported image files found in '{folder_path}'.")
	        return

	    if frame_total < 2:
	        print("Error: At least two images are required to create a video.")
	        return

	    print(f"Found {frame_total} images. Creating video...")

	    try:
	        # Turn the ordered image paths into a clip and export it.
	        sequence = ImageSequenceClip(frame_paths, fps=fps)
	        sequence.write_videofile(output_video_file, fps=fps)

	        print(f"Successfully created video: '{output_video_file}'")
	    except Exception as err:
	        # Best-effort: report the failure instead of propagating it.
	        print(f"An error occurred while creating the video: {err}")


	def _draw_detections(image, results, frame_color, font):
	    """Draw one box and one "label: score" caption on `image` per detection in `results`."""
	    draw = ImageDraw.Draw(image)

	    boxes = results.get("boxes", [])
	    scores = results.get("scores", [])
	    labels = results.get("text_labels", [])

	    for box, score, text_label in zip(boxes, scores, labels):
	        # Plain floats keep the drawing calls independent of the tensor type/device.
	        xmin, ymin, xmax, ymax = (float(value) for value in box)
	        draw.rectangle((xmin, ymin, xmax, ymax), outline=frame_color, width=10)

	        # Scores are expected to be float-convertible; fall back to .item() otherwise.
	        try:
	            score_val = float(score)
	        except (TypeError, ValueError):
	            score_val = score.item()

	        draw.text(
	            (xmin, ymin),
	            f"{text_label}: {round(score_val, 2)}",
	            fill="black",
	            stroke_width=1,
	            stroke_fill="black",
	            font=font,
	        )


	def object_detection(path_video, output_folder, config, max_frames=90):
	    """
	    Run zero-shot object detection on a video and save annotated frames.

	    Each processed frame is converted to RGB, passed through the
	    "iSEE-Laboratory/llmdet_tiny" checkpoint together with the configured
	    text labels, annotated with the detections scoring at least 0.50, and
	    written to ``<output_folder>/<frame index>.png``.

	    Args:
	        path_video (str): Path of the video file to read.
	        output_folder (str): Folder receiving the annotated PNG frames. It is
	            created if it does not exist.
	        config (dict): Settings; reads 'labels' (text prompts, default []) and
	            'frame_colour' (outline colour of the boxes, default None).
	        max_frames (int | None): Maximum number of frames to process, counted
	            from the start of the video. Defaults to 90; None means no limit.

	    Returns:
	        None. An unreadable video is reported with ``print``.

	    Raises:
	        OSError: If the font file "fonts/Perfect DOS VGA 437.ttf" cannot be
	            read, or a frame cannot be saved.
	    """
	    text_labels = config.get('labels', [])
	    frame_color = config.get('frame_colour')

	    vidcap = cv2.VideoCapture(path_video)
	    try:
	        # Fail fast, before the model is loaded.
	        if not vidcap.isOpened():
	            print(f"Error: Could not open video '{path_video}'.")
	            return

	        # Saving a frame into a missing folder would fail on the first frame.
	        if output_folder:
	            os.makedirs(output_folder, exist_ok=True)

	        # Alternative checkpoint: "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
	        checkpoint = "iSEE-Laboratory/llmdet_tiny"
	        model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
	        processor = AutoProcessor.from_pretrained(checkpoint)

	        # Loop-invariant: load the caption font once instead of once per detection.
	        font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)

	        frame_count = 0
	        while max_frames is None or frame_count < max_frames:
	            ret, frame = vidcap.read()
	            if not ret:
	                break

	            print(f"Processing frame {frame_count}")

	            # OpenCV delivers BGR; the processor and PIL work on RGB.
	            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	            image = Image.fromarray(rgb_frame).convert("RGB")

	            # The model is placed by device_map="auto", so follow the model's
	            # own device rather than guessing one separately.
	            inputs = processor(text=text_labels, images=image, return_tensors="pt").to(model.device)

	            with torch.no_grad():
	                outputs = model(**inputs)

	            results = processor.post_process_grounded_object_detection(
	                outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]

	            # The PIL image is annotated in place, then written out.
	            _draw_detections(image, results, frame_color, font)
	            image.save(os.path.join(output_folder, f"{frame_count}.png"))

	            frame_count += 1
	    finally:
	        # Always free the capture, even when inference or saving fails.
	        # No window is ever created here, so no GUI calls (waitKey /
	        # destroyAllWindows) are made; they can fail on headless OpenCV builds.
	        vidcap.release()