"""Gradio demo: image-to-video retrieval for car parts.

Detects car parts in a query image with a fine-tuned YOLO model, looks up
matching detections in a pre-built parquet index, and merges nearby
timestamps into contiguous video clips.
"""

import os

import cv2
import gradio as gr
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
from ultralytics import YOLO

# Minimum detection confidence, applied both to query-image detections and
# to rows of the pre-built detection index (originally the literal 0.5 in
# two places).
CONFIDENCE_THRESHOLD = 0.5

# Load model and detection index
print("Loading model and detection index...")
model = YOLO("best.pt")
detection_df = pd.read_parquet("detections.parquet")

# Video path (you may need to download this at runtime or use URL)
VIDEO_PATH = "data/videoplayback.mp4"
VIDEO_URL = "YOUR_VIDEO_URL_HERE"  # Replace with actual video URL or YouTube link


def download_video_if_needed():
    """Return True if the source video exists locally; warn and return False otherwise."""
    if not os.path.exists(VIDEO_PATH):
        print(f"Video not found at {VIDEO_PATH}")
        print("Please upload video or provide YouTube URL")
        # You can add yt-dlp here to download from YouTube
        return False
    return True


def merge_intervals(timestamps, gap_threshold=3.0):
    """Merge nearby timestamps into contiguous clips.

    Args:
        timestamps: Iterable of timestamps in seconds (duplicates allowed).
        gap_threshold: Maximum gap in seconds between consecutive timestamps
            for them to belong to the same clip.

    Returns:
        List of ``(start, end)`` tuples in ascending order; empty list for
        empty input.
    """
    if not timestamps:
        return []
    timestamps = sorted(set(timestamps))
    clips = []
    start = prev = timestamps[0]
    for t in timestamps[1:]:
        if t - prev > gap_threshold:
            # Gap too large: close the current clip and start a new one.
            clips.append((start, prev))
            start = t
        prev = t
    clips.append((start, prev))
    return clips


def retrieve_clips(query_image):
    """Detect car parts in the query image and retrieve matching video clips.

    Args:
        query_image: PIL image or numpy array from the Gradio input, or None.

    Returns:
        Tuple of (info text, annotated query image or None,
        clips DataFrame or None) matching the Gradio outputs.
    """
    if query_image is None:
        return "Please upload an image", None, None

    # Convert to PIL if needed (Gradio may hand us a numpy array)
    if isinstance(query_image, np.ndarray):
        query_image = Image.fromarray(query_image)

    # Detect components in query image
    results = model(query_image, verbose=False)[0]

    if len(results.boxes) == 0:
        return "No car parts detected in the image", query_image, None

    # Draw boxes on a copy so the original upload is untouched
    query_draw = query_image.copy()
    draw = ImageDraw.Draw(query_draw)

    retrieval_info = []
    all_clips = []

    # Process each detected component
    for box_idx in range(len(results.boxes)):
        cls_id = int(results.boxes.cls[box_idx])
        cls_name = model.names[cls_id]
        conf = float(results.boxes.conf[box_idx])
        bbox = results.boxes.xyxy[box_idx].tolist()

        if conf < CONFIDENCE_THRESHOLD:
            continue

        # Draw bounding box and label on the annotated copy
        x1, y1, x2, y2 = bbox
        draw.rectangle([x1, y1, x2, y2], outline='red', width=3)
        draw.text((x1, y1 - 20), f"{cls_name} ({conf:.2f})", fill='red')

        # Search detection index for the same class above the threshold
        matches = detection_df[detection_df['class_label'] == cls_name]
        matches = matches[matches['confidence_score'] > CONFIDENCE_THRESHOLD]

        if len(matches) == 0:
            retrieval_info.append(f"❌ {cls_name}: No matches found")
            continue

        # Merge frame-level hits into contiguous clips
        timestamps = matches['timestamp'].tolist()
        clips = merge_intervals(timestamps, gap_threshold=3.0)

        retrieval_info.append(
            f"✅ {cls_name} (conf: {conf:.2%}): {len(clips)} clips, {len(matches)} frames"
        )

        for start, end in clips[:3]:  # Limit to first 3 clips per component
            all_clips.append({
                'component': cls_name,
                'start': start,
                'end': end,
                'duration': end - start,
            })

    info_text = "\n".join(retrieval_info)

    # Create clips table (None keeps the Dataframe output empty)
    if all_clips:
        return info_text, query_draw, pd.DataFrame(all_clips)
    return info_text, query_draw, None


def extract_frame(component, start_time):
    """Extract a frame from the video at the given timestamp.

    Args:
        component: Component name (unused here; kept for interface parity).
        start_time: Timestamp in seconds.

    Returns:
        A PIL image, or None if the video is missing/unreadable or the
        frame read fails.
    """
    if not download_video_if_needed():
        return None

    cap = cv2.VideoCapture(VIDEO_PATH)
    try:
        # Guard against a video that failed to open: fps would be 0 and the
        # computed frame index would be bogus.
        if not cap.isOpened():
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            return None
        frame_num = int(start_time * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
    finally:
        # Always release the capture handle, even on an early return.
        cap.release()

    if ret:
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return Image.fromarray(frame_rgb)
    return None


# Create Gradio interface
with gr.Blocks(title="Image-to-Video Retrieval Demo") as demo:
    gr.Markdown("""
    # 🚗 Car Parts Image-to-Video Retrieval System

    Upload an image of a car part, and this system will find matching video clips!

    **How it works:**
    1. Upload a car image (doors, wheels, headlights, etc.)
    2. YOLOv26s detects all car parts in your image
    3. System retrieves matching video clips from the indexed video
    4. View timestamps and sample frames

    **Supported Components:** Doors, wheels, headlights, mirrors, bumpers, and more!
    """)

    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload Query Image")
            search_btn = gr.Button("🔍 Search Video", variant="primary")
        with gr.Column(scale=1):
            output_image = gr.Image(type="pil", label="Detected Components")
            output_text = gr.Textbox(label="Retrieval Results", lines=8)

    with gr.Row():
        output_table = gr.Dataframe(
            label="Matching Video Clips",
            headers=["component", "start", "end", "duration"],
        )

    gr.Markdown("""
    ---
    ### 📊 Technical Details
    - **Model:** YOLOv26s fine-tuned on car parts dataset
    - **Video Sampling:** Every 5th frame
    - **Matching:** Semantic component matching with confidence ≥ 0.5
    - **Clip Formation:** 3.0s gap threshold for temporal merging

    **Assignment 2 - CS-UY 4613 Artificial Intelligence**
    Hanze (James) Qiu | Spring 2026
    """)

    # Connect button
    search_btn.click(
        fn=retrieve_clips,
        inputs=[input_image],
        outputs=[output_text, output_image, output_table],
    )

    # Example images (optional - add paths to example images)
    # NOTE(review): some Gradio versions reject an empty examples list —
    # confirm against the installed version or populate/remove this.
    gr.Examples(
        examples=[
            # Add paths to example images if you have them
            # ["examples/car1.jpg"],
            # ["examples/car2.jpg"],
        ],
        inputs=input_image,
        label="Example Query Images",
    )


if __name__ == "__main__":
    print("Starting Gradio app...")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )