themalinery commited on
Commit
8023e2e
·
1 Parent(s): 754ab40
Files changed (6) hide show
  1. app.py +80 -4
  2. config.yaml +3 -13
  3. main.py +2 -81
  4. pyproject.toml +2 -1
  5. src/utils.py +80 -1
  6. uv.lock +0 -0
app.py CHANGED
@@ -1,7 +1,83 @@
1
  import gradio as gr
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import tempfile
3
+ import shutil
4
+ from pathlib import Path
5
 
6
+ from src.utils import create_video_from_images, object_detection
 
7
 
8
def process_video(video_file, labels_text, frame_color, fps=30):
    """Run zero-shot object detection on a video and return the output path.

    Args:
        video_file: Path to the uploaded video file (None if the user did
            not upload anything).
        labels_text: Comma-separated detection labels, e.g. "cat, dog".
        frame_color: Bounding-box colour (hex string from the colour picker).
        fps: Frame rate of the rendered output video. Defaults to 30, the
            previously hard-coded value, so existing callers are unaffected.

    Returns:
        str: Path to the processed video under ./results/gradio_outputs.

    Raises:
        gr.Error: If no video was uploaded or no labels were provided.
    """
    # Guard against a missing upload: Path(None) below would otherwise raise
    # a confusing TypeError deep inside the pipeline.
    if not video_file:
        raise gr.Error("Please upload a video first")

    # Parse comma-separated labels, dropping empty entries.
    text_labels = [label.strip() for label in labels_text.split(',') if label.strip()]

    if not text_labels:
        raise gr.Error("Please enter at least one label")

    # Config dict consumed by src.utils.object_detection.
    config = {
        'labels': text_labels,
        'frame_colour': frame_color,
    }

    # All intermediate frames live in a temp dir that is cleaned up
    # automatically when the with-block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        frames_dir = temp_path / "frames"
        frames_dir.mkdir()
        output_video = temp_path / "output.mp4"

        # Detect objects frame-by-frame, writing annotated PNGs.
        object_detection(str(video_file), str(frames_dir), config)

        # Stitch the annotated frames back into a video.
        create_video_from_images(str(frames_dir), str(output_video), fps=fps)

        # Copy out of the temp dir so the file survives for download.
        results_dir = Path("./results/gradio_outputs")
        results_dir.mkdir(parents=True, exist_ok=True)
        final_output = results_dir / f"detected_{Path(video_file).stem}.mp4"
        shutil.copy(output_video, final_output)

        return str(final_output)
41
+
42
# Gradio interface
with gr.Blocks(title="Video Object Detection", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Video Object Detection")
    gr.Markdown("Upload a video, enter labels to detect, choose frame color, and download the processed video.")

    with gr.Row():
        with gr.Column():
            # Input controls
            video_input = gr.Video(label="Upload Video")
            labels_input = gr.Textbox(label="Detection Labels (comma-separated)", placeholder="e.g., cat, dog, person")
            color_input = gr.ColorPicker(label="Bounding Box Color", value="#FF0000")
            process_btn = gr.Button("Process Video", variant="primary")

        with gr.Column():
            # Output section
            gr.Markdown("## Output")

            output_video = gr.Video(label="Processed Video", interactive=False)

            download_button = gr.File(label="Download Processed Video", visible=False)

    # Handle processing
    def process_and_update(video, labels_text, frame_color):
        """Click handler: run the pipeline and surface progress/errors in the UI."""
        try:
            gr.Info("Processing video... This may take a few minutes.")

            output_path = process_video(video, labels_text, frame_color)

            gr.Info("Video processing complete!")

            # Bug fix: download_button is created with visible=False and was
            # never un-hidden, so the download link never appeared.  Return an
            # update that also flips visibility on.
            return output_path, gr.update(value=output_path, visible=True)
        except gr.Error:
            # Let user-facing errors raised by process_video (e.g. "no labels")
            # pass through without double-wrapping the message.
            raise
        except Exception as e:
            raise gr.Error(f"Processing failed: {str(e)}")

    process_btn.click(
        fn=process_and_update,
        inputs=[video_input, labels_input, color_input],
        outputs=[output_video, download_button],
    )

if __name__ == "__main__":
    demo.launch()
config.yaml CHANGED
@@ -1,18 +1,8 @@
1
- # Configuration file for pose estimation project
2
- # Add your configuration parameters below
3
-
4
  task: "object_detection" # Options: "pose", "hand"
5
-
6
- input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\PXL_20250910_163543016.mp4"
7
-
8
  output_dir: ./results
9
-
10
  output_name: "cats.mp4"
11
-
12
  frames_dir: ./frames
 
 
13
 
14
- # Hand drawing parameters
15
- hand_drawing:
16
- radius: 20
17
- color_landmarks: [179, 124, 247] # BGR
18
- color_connections: [225, 225, 225] # BGR
 
 
 
 
1
  task: "object_detection" # Options: "object_detection", "pose", "hand"
2
+ input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\cat_20250910_163543016.mp4"
 
 
3
  output_dir: ./results
 
4
  output_name: "cats.mp4"
 
5
  frames_dir: ./frames
6
+ frame_colour: "white"
7
+ labels: ["cat"]
8
 
 
 
 
 
 
main.py CHANGED
@@ -1,18 +1,8 @@
1
  #https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
2
- from transformers import pipeline
3
- from transformers.image_utils import load_image
4
- from PIL import ImageDraw, Image, ImageFont
5
- from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
6
- import torch
7
  import yaml
8
  from pathlib import Path
9
  from datetime import datetime
10
- from src.utils import create_video_from_images
11
- import cv2
12
- import os
13
-
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
- print(device)
16
 
17
  def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
18
  """Extract paths from configuration dictionary."""
@@ -63,75 +53,6 @@ def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
63
  return input_list, output_path, config["frames_subdirs"]
64
 
65
 
66
- def object_detection(path_video, output_folder):
67
- checkpoint = "iSEE-Laboratory/iSEE-Laboratory_llmdet_large" #"openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
68
- model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
69
- processor = AutoProcessor.from_pretrained(checkpoint)
70
-
71
- # Initialize video capture
72
- vidcap = cv2.VideoCapture(path_video)
73
-
74
- frame_count = 0
75
- # Initialize hand tracking
76
- while vidcap.isOpened():
77
- ret, frame = vidcap.read()
78
- if not ret:
79
- break
80
-
81
- print(f"Processing frame {frame_count}")
82
-
83
- # Convert the BGR image to RGB and ensure RGB mode
84
- rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
85
- image = Image.fromarray(rgb_frame).convert("RGB")
86
-
87
- # use a flat list of labels for single-image inference
88
- text_labels = ["cat"]
89
- inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)
90
-
91
- with torch.no_grad():
92
- outputs = model(**inputs)
93
-
94
- # monkeypatch ImageDraw.text to accept a `fontsize` argument (absolute pixels or fraction of image height)
95
-
96
-
97
- results = processor.post_process_grounded_object_detection(
98
- outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]
99
-
100
- draw = ImageDraw.Draw(image)
101
-
102
- scores = results.get("scores", [])
103
- text_labels_res = results.get("text_labels", [])
104
- boxes = results.get("boxes", [])
105
-
106
- for box, score, text_label in zip(boxes, scores, text_labels_res):
107
- xmin, ymin, xmax, ymax = box
108
- draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=10)
109
- # convert score to float safely
110
- try:
111
- score_val = float(score)
112
- except Exception:
113
- score_val = round(score.item(), 2)
114
-
115
- # font_size = max(10, int(0.1 * image.height)) # 10% of image height, minimum 10 pixels
116
- #font = ImageFont.load_default(size=80)
117
- font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)
118
- draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}", fill="black", stroke_width=1, stroke_fill="black", font=font)
119
- # save the annotated image (PIL image is modified in-place)
120
- image.save(f"{output_folder}/{frame_count}.png")
121
-
122
- # Exit loop by pressing 'q'
123
- if cv2.waitKey(1) & 0xFF == ord('q'):
124
- break
125
-
126
- frame_count += 1
127
- if frame_count == 90: # limit to first 30 frames
128
- break
129
-
130
- # Release the video capture and close windows
131
- vidcap.release()
132
- cv2.destroyAllWindows()
133
-
134
-
135
  def main():
136
 
137
  with open('config.yaml', 'r') as file:
@@ -140,7 +61,7 @@ def main():
140
  input_path_list, output_path, frames_subdirs = get_paths_from_config(config)
141
 
142
  for input_path, frames_dir in zip(input_path_list, frames_subdirs):
143
- object_detection(str(input_path), str(frames_dir))
144
 
145
  # path_video_frame_dirs = [config['frames_dir']+'/'+dir for dir in os.listdir(config['frames_dir'])]
146
  # output_path = [config['output_dir']+'/'+config['task']+'/'+dir+'.mp4' for dir in os.listdir(config['frames_dir'])]
 
1
  #https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
 
 
 
 
 
2
  import yaml
3
  from pathlib import Path
4
  from datetime import datetime
5
+ from src.utils import create_video_from_images, object_detection
 
 
 
 
 
6
 
7
  def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
8
  """Extract paths from configuration dictionary."""
 
53
  return input_list, output_path, config["frames_subdirs"]
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  def main():
57
 
58
  with open('config.yaml', 'r') as file:
 
61
  input_path_list, output_path, frames_subdirs = get_paths_from_config(config)
62
 
63
  for input_path, frames_dir in zip(input_path_list, frames_subdirs):
64
+ object_detection(str(input_path), str(frames_dir), config)
65
 
66
  # path_video_frame_dirs = [config['frames_dir']+'/'+dir for dir in os.listdir(config['frames_dir'])]
67
  # output_path = [config['output_dir']+'/'+config['task']+'/'+dir+'.mp4' for dir in os.listdir(config['frames_dir'])]
pyproject.toml CHANGED
@@ -6,10 +6,11 @@ readme = "README.md"
6
  requires-python = ">=3.9"
7
  dependencies = [
8
  "accelerate>=1.10.1",
 
9
  "moviepy>=2.2.1",
10
  "natsort>=8.4.0",
11
  "opencv-python>=4.12.0.88",
12
- "pillow>=11.3.0",
13
  "six>=1.17.0",
14
  "torch>=2.8.0",
15
  "transformers>=4.57.1",
 
6
  requires-python = ">=3.9"
7
  dependencies = [
8
  "accelerate>=1.10.1",
9
+ "gradio>=4.0.0",
10
  "moviepy>=2.2.1",
11
  "natsort>=8.4.0",
12
  "opencv-python>=4.12.0.88",
13
+ "pillow>=8.0,<11.0",
14
  "six>=1.17.0",
15
  "torch>=2.8.0",
16
  "transformers>=4.57.1",
src/utils.py CHANGED
@@ -1,6 +1,13 @@
1
  import os
2
  from moviepy import ImageSequenceClip
3
  from natsort import natsorted
 
 
 
 
 
 
 
4
 
5
  def create_video_from_images(folder_path, output_video_file, fps):
6
  """
@@ -44,4 +51,76 @@ def create_video_from_images(folder_path, output_video_file, fps):
44
 
45
  print(f"Successfully created video: '{output_video_file}'")
46
  except Exception as e:
47
- print(f"An error occurred while creating the video: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from moviepy import ImageSequenceClip
3
  from natsort import natsorted
4
+ from transformers import pipeline
5
+ from transformers.image_utils import load_image
6
+ from PIL import ImageDraw, Image, ImageFont
7
+ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
8
+ import torch
9
+ import cv2
10
+ import os
11
 
12
  def create_video_from_images(folder_path, output_video_file, fps):
13
  """
 
51
 
52
  print(f"Successfully created video: '{output_video_file}'")
53
  except Exception as e:
54
+ print(f"An error occurred while creating the video: {e}")
55
+
56
+
57
def object_detection(path_video, output_folder, config):
    """Run zero-shot object detection on a video, saving annotated frames.

    Each processed frame is written to ``output_folder`` as
    ``<frame_index>.png`` with bounding boxes and score labels drawn on it,
    ready for ``create_video_from_images``.

    Args:
        path_video: Path to the input video file.
        output_folder: Directory where annotated PNG frames are written
            (assumed to already exist).
        config: Dict with keys:
            - 'labels': list of text prompts to detect (e.g. ["cat"]).
            - 'frame_colour': bounding-box outline colour; defaults to
              "white" (the colour the pre-refactor code hard-coded).
            - 'max_frames': optional cap on processed frames; defaults to
              90, matching the previous hard-coded limit.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    text_labels = config.get('labels', [])
    # Default to "white" so a missing/empty key doesn't yield outline=None,
    # which would draw no visible box at all.
    frame_color = config.get('frame_colour') or "white"
    # Was a hard-coded 90 with a stale "first 30 frames" comment; now
    # configurable with the same default.
    max_frames = config.get('max_frames', 90)

    checkpoint = "iSEE-Laboratory/llmdet_tiny"  # "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
    model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
    processor = AutoProcessor.from_pretrained(checkpoint)

    # Load the label font once (loop-invariant). Fall back to PIL's built-in
    # font instead of crashing when the bundled TTF is missing.
    try:
        font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)
    except OSError:
        font = ImageFont.load_default()

    # Initialize video capture
    vidcap = cv2.VideoCapture(path_video)
    frame_count = 0
    try:
        while vidcap.isOpened():
            ret, frame = vidcap.read()
            if not ret:
                break

            print(f"Processing frame {frame_count}")

            # OpenCV decodes to BGR; the model and PIL expect RGB.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(rgb_frame).convert("RGB")

            inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            results = processor.post_process_grounded_object_detection(
                outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]

            draw = ImageDraw.Draw(image)

            scores = results.get("scores", [])
            text_labels_res = results.get("text_labels", [])
            boxes = results.get("boxes", [])

            for box, score, text_label in zip(boxes, scores, text_labels_res):
                xmin, ymin, xmax, ymax = box
                draw.rectangle((xmin, ymin, xmax, ymax), outline=frame_color, width=10)
                # Tensor scalars need .item(); plain numbers cast directly.
                try:
                    score_val = float(score)
                except Exception:
                    score_val = round(score.item(), 2)
                draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}",
                          fill="black", stroke_width=1, stroke_fill="black", font=font)

            # Save every frame -- including ones with no detections -- so the
            # numbered frame sequence stays contiguous for the video stitcher.
            image.save(f"{output_folder}/{frame_count}.png")

            frame_count += 1
            if frame_count == max_frames:
                break
    finally:
        # Always release the capture, even if the model raises mid-loop.
        # (The old cv2.waitKey / destroyAllWindows calls were dead code in
        # this headless pipeline: no window is ever created.)
        vidcap.release()
uv.lock CHANGED
The diff for this file is too large to render. See raw diff