import cv2
import torch
import os
import numpy as np
from torchvision import transforms

def _detect_objects(frame, model, transform, device, size, score_threshold):
    """Run ``model`` on a resized copy of ``frame`` and return confident hits.

    Returns a list of ``(box, label, score)`` tuples where ``box`` is
    ``[x1, y1, x2, y2]`` in pixel coordinates of the *resized* image
    (``size`` is ``(width, height)``), keeping only detections whose score is
    strictly greater than ``score_threshold``.
    """
    resized = cv2.resize(frame, size)
    # OpenCV decodes frames as BGR; the tensor fed to the model is RGB.
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    img_tensor = transform(rgb).to(device)

    with torch.no_grad():
        prediction = model([img_tensor])[0]

    # Move each result tensor to the CPU once instead of reading it element
    # by element, which would force one device sync per value on GPU.
    boxes = prediction["boxes"].detach().cpu().tolist()
    scores = prediction["scores"].detach().cpu().tolist()
    labels = prediction["labels"].detach().cpu().tolist()

    return [
        (box, label, score)
        for box, score, label in zip(boxes, scores, labels)
        if score > score_threshold
    ]


def _draw_detections(frame, detections, scale_x, scale_y):
    """Draw boxes and ``label:score`` captions on ``frame`` in place.

    ``scale_x`` / ``scale_y`` map detection coordinates (inference
    resolution) onto ``frame``'s own resolution.
    """
    for (x1, y1, x2, y2), label, score in detections:
        left, top = int(x1 * scale_x), int(y1 * scale_y)
        right, bottom = int(x2 * scale_x), int(y2 * scale_y)
        cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
        cv2.putText(frame, f"{label}:{score:.2f}", (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)


def process_video(video_path, model, output_path, device, *,
                  frame_skip=2, score_threshold=0.5,
                  inference_size=(640, 480)):
    """Annotate a video with object detections and write the result to disk.

    Every ``frame_skip``-th frame is resized to ``inference_size``, converted
    to an RGB tensor and passed to ``model``. Detections above
    ``score_threshold`` are drawn on the original-resolution frame; frames in
    between reuse the most recent detections so the boxes do not flicker.
    The output is XVID-encoded with the input's frame size and frame rate.

    Args:
        video_path: Path of the input video (converted with ``str()``).
        model: Callable invoked as ``model([img_tensor])``; the first element
            of its result must be a mapping with tensor entries ``"boxes"``,
            ``"scores"`` and ``"labels"``.
            NOTE(review): presumably a torchvision-style detection model that
            is already in eval mode and on ``device`` -- this function does
            not call ``model.eval()``; confirm at the call site.
        output_path: Path of the video file to create.
        device: Device the image tensor is moved to before inference.
        frame_skip: Run inference on every ``frame_skip``-th frame (>= 1).
        score_threshold: Detections with ``score <= score_threshold`` are
            discarded.
        inference_size: ``(width, height)`` the frame is resized to for the
            model.

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: If ``frame_skip`` is smaller than 1.
        IOError: If the input video or the output writer cannot be opened.
    """
    if frame_skip < 1:
        raise ValueError(f"frame_skip must be >= 1, got {frame_skip}")
    inference_width, inference_height = inference_size

    transform = transforms.Compose([
        transforms.ToTensor()
    ])

    cap = cv2.VideoCapture(str(video_path))
    out = None
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video: {video_path}")

        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
        if not out.isOpened():
            # Without this check a missing codec or bad path would silently
            # produce an empty output file.
            raise IOError(f"Cannot open video writer: {output_path}")

        detections = []
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_skip == 0:
                detections = _detect_objects(
                    frame, model, transform, device,
                    (inference_width, inference_height), score_threshold,
                )

            # Draw on the original frame (not the downscaled copy) so the
            # written video keeps its source quality.
            frame_height, frame_width = frame.shape[:2]
            _draw_detections(
                frame, detections,
                frame_width / inference_width,
                frame_height / inference_height,
            )

            # The writer was opened for (width, height); make sure every
            # frame matches even if the decoder reports a different size.
            if (frame_width, frame_height) != (width, height):
                frame = cv2.resize(frame, (width, height))
            out.write(frame)

            frame_count += 1
    finally:
        # Release both handles even when inference or decoding raises.
        cap.release()
        if out is not None:
            out.release()

    return output_path