Spaces:
Sleeping
Sleeping
Commit
·
8023e2e
1
Parent(s):
754ab40
web app
Browse files- app.py +80 -4
- config.yaml +3 -13
- main.py +2 -81
- pyproject.toml +2 -1
- src/utils.py +80 -1
- uv.lock +0 -0
app.py
CHANGED
|
@@ -1,7 +1,83 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
return "Hello " + name + "!!"
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import tempfile
|
| 3 |
+
import shutil
|
| 4 |
+
from pathlib import Path
|
| 5 |
|
| 6 |
+
from src.utils import create_video_from_images, object_detection
|
|
|
|
| 7 |
|
| 8 |
+
def process_video(video_file, labels_text, frame_color):
    """Run object detection on an uploaded video and return the annotated copy.

    Args:
        video_file: Path of the uploaded video, as supplied by the Gradio
            video input (``None`` when nothing was uploaded).
        labels_text: Comma-separated labels to detect, e.g. ``"cat, dog"``.
        frame_color: Bounding-box colour, forwarded to ``object_detection``
            under the ``frame_colour`` config key.

    Returns:
        The path (as ``str``) of the annotated video copied into
        ``./results/gradio_outputs``.

    Raises:
        gr.Error: If no video was uploaded, no label was entered, or no
            output video was produced.
    """
    # Without this guard str(None) would be handed to the detector and the
    # failure would only surface later as a missing output file.
    if not video_file:
        raise gr.Error("Please upload a video")

    # Parse labels, ignoring empty entries such as the one after a trailing comma.
    text_labels = [
        label.strip()
        for label in (labels_text or "").split(',')
        if label.strip()
    ]
    if not text_labels:
        raise gr.Error("Please enter at least one label")

    config = {
        'labels': text_labels,
        'frame_colour': frame_color,
    }

    # Intermediate frames and the raw output live in a temporary directory
    # that is removed when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        frames_dir = temp_path / "frames"
        frames_dir.mkdir()
        output_video = temp_path / "output.mp4"

        # Annotate the frames, then stitch them back into a video.
        object_detection(str(video_file), str(frames_dir), config)
        create_video_from_images(str(frames_dir), str(output_video), fps=30)

        # create_video_from_images prints its errors instead of raising, so
        # the only reliable success signal is that the file exists.
        if not output_video.exists():
            raise gr.Error(
                "No output video was produced. "
                "Check that the uploaded file is a readable video."
            )

        # Copy out of the temporary directory so the result survives cleanup.
        results_dir = Path("./results/gradio_outputs")
        results_dir.mkdir(parents=True, exist_ok=True)
        final_output = results_dir / f"detected_{Path(video_file).stem}.mp4"
        shutil.copy(output_video, final_output)

    return str(final_output)
|
| 41 |
+
|
| 42 |
+
# Gradio interface
|
| 43 |
+
with gr.Blocks(title="Video Object Detection", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Video Object Detection")
    gr.Markdown("Upload a video, enter labels to detect, choose frame color, and download the processed video.")

    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            video_input = gr.Video(label="Upload Video")
            labels_input = gr.Textbox(label="Detection Labels (comma-separated)", placeholder="e.g., cat, dog, person")
            color_input = gr.ColorPicker(label="Bounding Box Color", value="#FF0000")
            process_btn = gr.Button("Process Video", variant="primary")

        # Right column: outputs.
        with gr.Column():
            gr.Markdown("## Output")

            output_video = gr.Video(label="Processed Video", interactive=False)

            # Hidden until a result exists; revealed by process_and_update.
            download_button = gr.File(label="Download Processed Video", visible=False)

    def process_and_update(video, labels_text, frame_color):
        """Process the upload and return updates for the player and download box.

        Returns a tuple of the output path (for the video player) and an
        update that sets the same path on the file component and makes it
        visible.

        Raises:
            gr.Error: Validation errors from ``process_video`` are passed
                through unchanged; any other exception is reported as
                "Processing failed: ...".
        """
        gr.Info("Processing video... This may take a few minutes.")

        try:
            output_path = process_video(video, labels_text, frame_color)
        except gr.Error:
            # Already a user-facing message; do not prefix it a second time.
            raise
        except Exception as e:
            raise gr.Error(f"Processing failed: {str(e)}") from e

        gr.Info("Video processing complete!")

        # Setting a value alone does not un-hide a component created with
        # visible=False, so the visibility is updated explicitly.
        return output_path, gr.update(value=output_path, visible=True)

    process_btn.click(
        fn=process_and_update,
        inputs=[video_input, labels_input, color_input],
        outputs=[output_video, download_button]
    )

if __name__ == "__main__":
    demo.launch()
|
config.yaml
CHANGED
|
@@ -1,18 +1,8 @@
|
|
| 1 |
-
# Configuration file for pose estimation project
|
| 2 |
-
# Add your configuration parameters below
|
| 3 |
-
|
| 4 |
task: "object_detection" # Options: "pose", "hand"
|
| 5 |
-
|
| 6 |
-
input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\PXL_20250910_163543016.mp4"
|
| 7 |
-
|
| 8 |
output_dir: ./results
|
| 9 |
-
|
| 10 |
output_name: "cats.mp4"
|
| 11 |
-
|
| 12 |
frames_dir: ./frames
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
# Hand drawing parameters
|
| 15 |
-
hand_drawing:
|
| 16 |
-
radius: 20
|
| 17 |
-
color_landmarks: [179, 124, 247] # BGR
|
| 18 |
-
color_connections: [225, 225, 225] # BGR
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
task: "object_detection" # Options: "object_detection", "pose", "hand"
|
| 2 |
+
input_path: "D:\\youtube\\skiathos-sep 2025\\cats\\cat_20250910_163543016.mp4"
|
|
|
|
|
|
|
| 3 |
output_dir: ./results
|
|
|
|
| 4 |
output_name: "cats.mp4"
|
|
|
|
| 5 |
frames_dir: ./frames
|
| 6 |
+
frame_colour: "white"
|
| 7 |
+
labels: ["cat"]
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
CHANGED
|
@@ -1,18 +1,8 @@
|
|
| 1 |
#https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
|
| 2 |
-
from transformers import pipeline
|
| 3 |
-
from transformers.image_utils import load_image
|
| 4 |
-
from PIL import ImageDraw, Image, ImageFont
|
| 5 |
-
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
| 6 |
-
import torch
|
| 7 |
import yaml
|
| 8 |
from pathlib import Path
|
| 9 |
from datetime import datetime
|
| 10 |
-
from src.utils import create_video_from_images
|
| 11 |
-
import cv2
|
| 12 |
-
import os
|
| 13 |
-
|
| 14 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 15 |
-
print(device)
|
| 16 |
|
| 17 |
def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
|
| 18 |
"""Extract paths from configuration dictionary."""
|
|
@@ -63,75 +53,6 @@ def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
|
|
| 63 |
return input_list, output_path, config["frames_subdirs"]
|
| 64 |
|
| 65 |
|
| 66 |
-
def object_detection(path_video, output_folder):
|
| 67 |
-
checkpoint = "iSEE-Laboratory/iSEE-Laboratory_llmdet_large" #"openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
|
| 68 |
-
model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
|
| 69 |
-
processor = AutoProcessor.from_pretrained(checkpoint)
|
| 70 |
-
|
| 71 |
-
# Initialize video capture
|
| 72 |
-
vidcap = cv2.VideoCapture(path_video)
|
| 73 |
-
|
| 74 |
-
frame_count = 0
|
| 75 |
-
# Initialize hand tracking
|
| 76 |
-
while vidcap.isOpened():
|
| 77 |
-
ret, frame = vidcap.read()
|
| 78 |
-
if not ret:
|
| 79 |
-
break
|
| 80 |
-
|
| 81 |
-
print(f"Processing frame {frame_count}")
|
| 82 |
-
|
| 83 |
-
# Convert the BGR image to RGB and ensure RGB mode
|
| 84 |
-
rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
| 85 |
-
image = Image.fromarray(rgb_frame).convert("RGB")
|
| 86 |
-
|
| 87 |
-
# use a flat list of labels for single-image inference
|
| 88 |
-
text_labels = ["cat"]
|
| 89 |
-
inputs = processor(text=text_labels, images=image, return_tensors="pt").to(device)
|
| 90 |
-
|
| 91 |
-
with torch.no_grad():
|
| 92 |
-
outputs = model(**inputs)
|
| 93 |
-
|
| 94 |
-
# monkeypatch ImageDraw.text to accept a `fontsize` argument (absolute pixels or fraction of image height)
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
results = processor.post_process_grounded_object_detection(
|
| 98 |
-
outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]
|
| 99 |
-
|
| 100 |
-
draw = ImageDraw.Draw(image)
|
| 101 |
-
|
| 102 |
-
scores = results.get("scores", [])
|
| 103 |
-
text_labels_res = results.get("text_labels", [])
|
| 104 |
-
boxes = results.get("boxes", [])
|
| 105 |
-
|
| 106 |
-
for box, score, text_label in zip(boxes, scores, text_labels_res):
|
| 107 |
-
xmin, ymin, xmax, ymax = box
|
| 108 |
-
draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=10)
|
| 109 |
-
# convert score to float safely
|
| 110 |
-
try:
|
| 111 |
-
score_val = float(score)
|
| 112 |
-
except Exception:
|
| 113 |
-
score_val = round(score.item(), 2)
|
| 114 |
-
|
| 115 |
-
# font_size = max(10, int(0.1 * image.height)) # 10% of image height, minimum 10 pixels
|
| 116 |
-
#font = ImageFont.load_default(size=80)
|
| 117 |
-
font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)
|
| 118 |
-
draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}", fill="black", stroke_width=1, stroke_fill="black", font=font)
|
| 119 |
-
# save the annotated image (PIL image is modified in-place)
|
| 120 |
-
image.save(f"{output_folder}/{frame_count}.png")
|
| 121 |
-
|
| 122 |
-
# Exit loop by pressing 'q'
|
| 123 |
-
if cv2.waitKey(1) & 0xFF == ord('q'):
|
| 124 |
-
break
|
| 125 |
-
|
| 126 |
-
frame_count += 1
|
| 127 |
-
if frame_count == 90: # limit to first 30 frames
|
| 128 |
-
break
|
| 129 |
-
|
| 130 |
-
# Release the video capture and close windows
|
| 131 |
-
vidcap.release()
|
| 132 |
-
cv2.destroyAllWindows()
|
| 133 |
-
|
| 134 |
-
|
| 135 |
def main():
|
| 136 |
|
| 137 |
with open('config.yaml', 'r') as file:
|
|
@@ -140,7 +61,7 @@ def main():
|
|
| 140 |
input_path_list, output_path, frames_subdirs = get_paths_from_config(config)
|
| 141 |
|
| 142 |
for input_path, frames_dir in zip(input_path_list, frames_subdirs):
|
| 143 |
-
object_detection(str(input_path), str(frames_dir))
|
| 144 |
|
| 145 |
# path_video_frame_dirs = [config['frames_dir']+'/'+dir for dir in os.listdir(config['frames_dir'])]
|
| 146 |
# output_path = [config['output_dir']+'/'+config['task']+'/'+dir+'.mp4' for dir in os.listdir(config['frames_dir'])]
|
|
|
|
| 1 |
#https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import yaml
|
| 3 |
from pathlib import Path
|
| 4 |
from datetime import datetime
|
| 5 |
+
from src.utils import create_video_from_images, object_detection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
def get_paths_from_config(config: dict) -> tuple[Path, Path, Path]:
|
| 8 |
"""Extract paths from configuration dictionary."""
|
|
|
|
| 53 |
return input_list, output_path, config["frames_subdirs"]
|
| 54 |
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def main():
|
| 57 |
|
| 58 |
with open('config.yaml', 'r') as file:
|
|
|
|
| 61 |
input_path_list, output_path, frames_subdirs = get_paths_from_config(config)
|
| 62 |
|
| 63 |
for input_path, frames_dir in zip(input_path_list, frames_subdirs):
|
| 64 |
+
object_detection(str(input_path), str(frames_dir), config)
|
| 65 |
|
| 66 |
# path_video_frame_dirs = [config['frames_dir']+'/'+dir for dir in os.listdir(config['frames_dir'])]
|
| 67 |
# output_path = [config['output_dir']+'/'+config['task']+'/'+dir+'.mp4' for dir in os.listdir(config['frames_dir'])]
|
pyproject.toml
CHANGED
|
@@ -6,10 +6,11 @@ readme = "README.md"
|
|
| 6 |
requires-python = ">=3.9"
|
| 7 |
dependencies = [
|
| 8 |
"accelerate>=1.10.1",
|
|
|
|
| 9 |
"moviepy>=2.2.1",
|
| 10 |
"natsort>=8.4.0",
|
| 11 |
"opencv-python>=4.12.0.88",
|
| 12 |
-
"pillow>=11.
|
| 13 |
"six>=1.17.0",
|
| 14 |
"torch>=2.8.0",
|
| 15 |
"transformers>=4.57.1",
|
|
|
|
| 6 |
requires-python = ">=3.9"
|
| 7 |
dependencies = [
|
| 8 |
"accelerate>=1.10.1",
|
| 9 |
+
"gradio>=4.0.0",
|
| 10 |
"moviepy>=2.2.1",
|
| 11 |
"natsort>=8.4.0",
|
| 12 |
"opencv-python>=4.12.0.88",
|
| 13 |
+
"pillow>=8.0,<11.0",
|
| 14 |
"six>=1.17.0",
|
| 15 |
"torch>=2.8.0",
|
| 16 |
"transformers>=4.57.1",
|
src/utils.py
CHANGED
|
@@ -1,6 +1,13 @@
|
|
| 1 |
import os
|
| 2 |
from moviepy import ImageSequenceClip
|
| 3 |
from natsort import natsorted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
def create_video_from_images(folder_path, output_video_file, fps):
|
| 6 |
"""
|
|
@@ -44,4 +51,76 @@ def create_video_from_images(folder_path, output_video_file, fps):
|
|
| 44 |
|
| 45 |
print(f"Successfully created video: '{output_video_file}'")
|
| 46 |
except Exception as e:
|
| 47 |
-
print(f"An error occurred while creating the video: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from moviepy import ImageSequenceClip
|
| 3 |
from natsort import natsorted
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
from transformers.image_utils import load_image
|
| 6 |
+
from PIL import ImageDraw, Image, ImageFont
|
| 7 |
+
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
| 8 |
+
import torch
|
| 9 |
+
import cv2
|
| 10 |
+
import os
|
| 11 |
|
| 12 |
def create_video_from_images(folder_path, output_video_file, fps):
|
| 13 |
"""
|
|
|
|
| 51 |
|
| 52 |
print(f"Successfully created video: '{output_video_file}'")
|
| 53 |
except Exception as e:
|
| 54 |
+
print(f"An error occurred while creating the video: {e}")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def object_detection(path_video, output_folder, config, max_frames=90):
    """Annotate the frames of a video with zero-shot object detections.

    Each processed frame is written to ``output_folder`` as
    ``{frame_index}.png`` with a rectangle and a "label: score" caption drawn
    for every detection whose score passes the 0.50 threshold.

    Args:
        path_video: Path of the input video, opened with ``cv2.VideoCapture``.
        output_folder: Directory that receives the annotated PNG frames; it is
            created if it does not exist.
        config: Mapping read for ``'labels'`` (list of text labels to detect)
            and ``'frame_colour'`` (rectangle outline colour).
        max_frames: Maximum number of frames to process, or ``None`` for no
            limit. Defaults to 90, the limit this function always applied.
    """
    text_labels = config.get('labels', [])
    # Default to white when the config carries no colour, so boxes are
    # still drawn.
    frame_color = config.get('frame_colour') or "white"

    checkpoint = "iSEE-Laboratory/llmdet_tiny"
    model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint, device_map="auto")
    processor = AutoProcessor.from_pretrained(checkpoint)

    os.makedirs(output_folder, exist_ok=True)

    # Loaded on first use and then reused, instead of once per box.
    font = None

    vidcap = cv2.VideoCapture(path_video)
    frame_count = 0
    try:
        while vidcap.isOpened():
            if max_frames is not None and frame_count >= max_frames:
                break

            ret, frame = vidcap.read()
            if not ret:
                break

            print(f"Processing frame {frame_count}")

            # OpenCV decodes to BGR; convert to an RGB PIL image.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(rgb_frame).convert("RGB")

            # Move the inputs to wherever device_map="auto" placed the model.
            inputs = processor(text=text_labels, images=image, return_tensors="pt").to(model.device)

            with torch.no_grad():
                outputs = model(**inputs)

            results = processor.post_process_grounded_object_detection(
                outputs, threshold=0.50, target_sizes=[(image.height, image.width)])[0]

            draw = ImageDraw.Draw(image)

            scores = results.get("scores", [])
            text_labels_res = results.get("text_labels", [])
            boxes = results.get("boxes", [])

            for box, score, text_label in zip(boxes, scores, text_labels_res):
                # Plain floats keep the drawing calls independent of the
                # tensor type and device.
                xmin, ymin, xmax, ymax = (float(v) for v in box)
                draw.rectangle((xmin, ymin, xmax, ymax), outline=frame_color, width=10)

                score_val = float(score)

                if font is None:
                    font = ImageFont.truetype("fonts/Perfect DOS VGA 437.ttf", size=60)
                draw.text((xmin, ymin), f"{text_label}: {round(score_val,2)}", fill="black", stroke_width=1, stroke_fill="black", font=font)

            # The PIL image was modified in place by the draw calls above.
            image.save(f"{output_folder}/{frame_count}.png")

            frame_count += 1
    finally:
        # Release the capture even if a frame fails. No window is ever
        # opened here, so there is no key polling or window teardown.
        vidcap.release()
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|