import os from moviepy import ImageSequenceClip from natsort import natsorted from transformers import pipeline from transformers.image_utils import load_image from PIL import ImageDraw, Image, ImageFont from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection import torch import cv2 import os def create_video_from_images(folder_path, output_video_file, fps): """ Creates a video file from a sequence of images in a folder. Args: folder_path (str): The path to the folder containing the images. output_video_file (str): The name of the output video file (e.g., 'my_video.mp4'). fps (int): The frames per second for the output video. """ if not os.path.isdir(folder_path): print(f"Error: The folder '{folder_path}' does not exist.") return # List all image files in the folder. # We use natsorted to ensure files with numerical names (e.g., image-1.png, image-10.png) # are sorted in a human-friendly way. supported_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif') image_files = [ os.path.join(folder_path, f) for f in natsorted(os.listdir(folder_path)) if f.lower().endswith(supported_extensions) ] if not image_files: print(f"Error: No supported image files found in '{folder_path}'.") return if len(image_files) < 2: print("Error: At least two images are required to create a video.") return print(f"Found {len(image_files)} images. Creating video...") try: # Create a video clip from the list of image files. clip = ImageSequenceClip(image_files, fps=fps) # Write the video file to the specified path. clip.write_videofile(output_video_file, fps=fps) print(f"Successfully created video: '{output_video_file}'") except Exception as e: print(f"An error occurred while creating the video: {e}") def object_detection(path_video, output_folder, config): device = "cuda" if torch.cuda.is_available() else "cpu" text_labels = config.get('labels', []) frame_color = config.get('frame_colour') checkpoint = "iSEE-Laboratory/llmdet_tiny" #"openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det" model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto") processor = AutoProcessor.from_pretrained(checkpoint) # Initialize video capture vidcap = cv2.VideoCapture(path_video) frame_count = 0 # Initialize hand tracking while vidcap.isOpened(): ret, frame = vidcap.read() if not ret: break print(f"Processing frame {frame_count}") # Convert the BGR image to RGB and ensure RGB mode rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) image = Image.fromarray(rgb_frame).convert("RGB") inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device) with torch.no_grad(): outputs = model(**inputs) # monkeypatch ImageDraw.text to accept a `fontsize` argument (absolute pixels or fraction of image height) results = processor.post_process_grounded_object_detection( outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0] draw = ImageDraw.Draw(image) scores = results.get("scores", []) text_labels_res = results.get("text_labels", []) boxes = results.get("boxes", []) for box, score, text_label in zip(boxes, scores, text_labels_res): xmin, ymin, xmax, ymax = box draw.rectangle((xmin, ymin, xmax, ymax), outline=frame_color, width=10) # convert score to float safely try: score_val = float(score) except Exception: score_val = round(score.item(), 2) # font_size = max(10, int(0.1 * image.height)) # 10% of image height, minimum 10 pixels #font = ImageFont.load_default(size=80) font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60) draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}", fill="black", stroke_width=1, stroke_fill="black", font=font) # save the annotated image (PIL image is modified in-place) image.save(f"{output_folder}/{frame_count}.png") # Exit loop by pressing 'q' if cv2.waitKey(1) & 0xFF == ord('q'): break frame_count += 1 if frame_count == 90: # limit to first 30 frames break # Release the video capture and close windows vidcap.release() cv2.destroyAllWindows()