import cv2 import torch from transformers import DetrImageProcessor, DetrForObjectDetection, TrOCRProcessor, VisionEncoderDecoderModel from PIL import Image from datetime import datetime # Ensure all required libraries are installed try: import timm # Required by DETR except ImportError: raise ImportError("The 'timm' library is required but not installed. Install it using 'pip install timm'.") # Load the DETR model for object detection (license plate detection) try: detr_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") detr_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50") except Exception as e: raise RuntimeError(f"Error initializing DETR model: {e}") # Load the TrOCR model for OCR (license plate text recognition) try: trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") except Exception as e: raise RuntimeError(f"Error initializing TrOCR model: {e}") def detect_license_plate(frame): """ Detect license plates in a video frame using DETR. """ # Convert the frame to a PIL image pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) # Preprocess the image for DETR inputs = detr_processor(images=pil_image, return_tensors="pt") outputs = detr_model(**inputs) # Get detected objects and filter for license plates logits = outputs.logits boxes = outputs.pred_boxes probas = logits.softmax(-1)[0, :, :-1] keep = probas.max(-1).values > 0.9 # Confidence threshold detected_boxes = [] for box, score in zip(boxes[keep], probas[keep]): # Convert box coordinates to pixel values box = box.detach().cpu().numpy() detected_boxes.append(box) return detected_boxes def recognize_text(plate_image): """ Recognize text from a license plate image using TrOCR. """ # Convert the license plate image to a PIL image pil_image = Image.fromarray(cv2.cvtColor(plate_image, cv2.COLOR_BGR2RGB)) # Preprocess the image for TrOCR pixel_values = trocr_processor(images=pil_image, return_tensors="pt").pixel_values generated_ids = trocr_model.generate(pixel_values) text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0] return text.strip() def process_video(video_path, frame_skip=5): """ Process a video to detect license plates and log entry/exit times. """ cap = cv2.VideoCapture(video_path) vehicle_data = {} frame_count = 0 while cap.isOpened(): ret, frame = cap.read() if not ret: break frame_count += 1 if frame_count % frame_skip != 0: continue # Skip frames to optimize processing time # Detect license plates detected_boxes = detect_license_plate(frame) for box in detected_boxes: x_min, y_min, x_max, y_max = map(int, box) license_plate_image = frame[y_min:y_max, x_min:x_max] # Recognize text from the license plate license_plate = recognize_text(license_plate_image) if license_plate: current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if license_plate not in vehicle_data: # Vehicle entering vehicle_data[license_plate] = {'entry_time': current_time, 'exit_time': None} print(f"Vehicle {license_plate} entered at {current_time}") else: # Update exit time vehicle_data[license_plate]['exit_time'] = current_time # Draw bounding box and license plate text cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2) cv2.putText(frame, license_plate, (x_min, y_min-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2) # Display the frame (optional, can be removed for headless environments) cv2.imshow('Vehicle Detection', frame) # Break on 'q' key press if cv2.waitKey(1) & 0xFF == ord('q'): break cap.release() cv2.destroyAllWindows() # Print vehicle data print("\nVehicle Data:") for plate, times in vehicle_data.items(): print(f"License Plate: {plate}, Entry Time: {times['entry_time']}, Exit Time: {times['exit_time']}") if __name__ == "__main__": # Replace 'road_video.mp4' with the path to your video file or use 0 for webcam process_video("road_video.mp4", frame_skip=5)