Spaces:

hasanbasbunar
/

SAM3

Running on Zero

App Files Files Community

hasanbasbunar committed about 1 month ago

Commit

5e513b8

verified ·

1 Parent(s): 8f0a7b7

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -50

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import torch
 import numpy as np
 import cv2
-from PIL import Image
 import matplotlib.pyplot as plt
 import matplotlib
 import tempfile
@@ -115,12 +115,28 @@ def get_first_frame(video_path):
         return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
     return None
 # --- HELPERS FOR DYNAMIC ZEROGPU DURATION ---
 def compute_duration_text(video_path, text_prompt, max_frames, timeout_seconds):
     """Return ``timeout_seconds`` unchanged as the duration for a text-prompted video run.
     The parameters mirror ``process_video_text``; only ``timeout_seconds`` is read.
     NOTE(review): presumably passed as ``duration=`` to ``@spaces.GPU`` - confirm at the decorator.
     """
     return timeout_seconds
-def compute_duration_tracker(video_path, x, y, max_frames, timeout_seconds):
-    """Return ``timeout_seconds`` unchanged; used as the ``duration`` callable of ``@spaces.GPU`` on ``process_video_tracker_gpu``, whose parameters it mirrors."""
     return timeout_seconds
 # --- LOGIC WITH ZEROGPU DECORATORS ---
@@ -144,11 +160,10 @@ def process_image_text(image, text_prompt, threshold, mask_threshold):
     except Exception as e:
         return image, f"Error: {str(e)}"
-# MODIFICATION IMPORTANTE : Cette fonction GPU ne prend plus 'evt', mais 'x' et 'y' directement
 @spaces.GPU
 def process_image_tracker_gpu(image, x, y, points_state, labels_state, multimask):
     if image is None: return image, [], []
-    # x et y sont maintenant des entiers simples
     if points_state is None: points_state = []; labels_state = []
     points_state.append([x, y])
     labels_state.append(1)
@@ -166,21 +181,18 @@ def process_image_tracker_gpu(image, x, y, points_state, labels_state, multimask
              best_idx = np.argmax(scores)
              masks_to_show = masks_to_show[best_idx:best_idx+1]
         final_img = overlay_masks(image, masks_to_show)
-        draw = Image.fromarray(np.array(final_img))
-        import PIL.ImageDraw
-        d = PIL.ImageDraw.Draw(draw)
-        for pt in points_state:
-            d.ellipse((pt[0]-5, pt[1]-5, pt[0]+5, pt[1]+5), fill="red", outline="white")
-        return draw, points_state, labels_state
     except Exception as e:
         print(f"Tracker Error: {e}")
         return image, points_state, labels_state
-# WRAPPER CPU POUR IMAGE TRACKER : Extrait les données avant d'appeler le GPU
 def process_image_tracker_wrapper(image, evt: gr.SelectData, points_state, labels_state, multimask):
     """Unpack the click position from the select event and forward it to ``process_image_tracker_gpu``.
     When ``evt`` is None the image and both states are returned unchanged.
     """
     if evt is None: return image, points_state, labels_state
     x, y = evt.index
-    # Call the GPU function with plain values
     return process_image_tracker_gpu(image, x, y, points_state, labels_state, multimask)
@@ -220,9 +232,36 @@ def process_video_text(video_path, text_prompt, max_frames, timeout_seconds):
         return output_path, "Done!"
     except Exception as e: return None, f"Error: {str(e)}"
-# MODIFICATION IMPORTANTE : Cette fonction GPU ne prend plus 'first_frame_click' (objet complexe) mais x, y
 @spaces.GPU(duration=compute_duration_tracker)
-def process_video_tracker_gpu(video_path, x, y, max_frames, timeout_seconds):
     try:
         model, processor = get_model("sam3_video_tracker")
         cap = cv2.VideoCapture(video_path)
@@ -238,7 +277,18 @@ def process_video_tracker_gpu(video_path, x, y, max_frames, timeout_seconds):
             frame_count += 1
         cap.release()
         inference_session = processor.init_video_session(video=frames, inference_device=device, dtype=torch.bfloat16)
-        processor.add_inputs_to_inference_session(inference_session=inference_session, frame_idx=0, obj_ids=1, input_points=[[[[x, y]]]], input_labels=[[[1]]])
         output_path = tempfile.mktemp(suffix=".mp4")
         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
         out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
@@ -257,25 +307,6 @@ def process_video_tracker_gpu(video_path, x, y, max_frames, timeout_seconds):
         print(f"Video Tracker Error: {e}")
         return None, f"Fatal Error: {str(e)}"
-# WRAPPER CPU POUR VIDEO TRACKER
-def process_video_tracker_wrapper(video_path, first_frame_click, max_frames, timeout_seconds):
-    """Validate the inputs, read ``(x, y)`` from the click's ``index`` and call ``process_video_tracker_gpu``.
-    Returns ``(None, message)`` when the video or the click is missing, or when the click has no ``index``.
-    """
-    if not video_path or not first_frame_click: return None, "Please click on the first frame."
-    # Extract the plain values here, on the CPU
-    if hasattr(first_frame_click, 'index'):
-        x, y = first_frame_click.index
-    else:
-        return None, "Click error."
-    # Call the GPU function with integers
-    return process_video_tracker_gpu(video_path, x, y, max_frames, timeout_seconds)
-# NOUVEAU WRAPPER POUR L'AUTO-START (Select Event)
-def video_select_trigger(video_path, max_frames, duration, evt: gr.SelectData):
-    """Run ``process_video_tracker_wrapper`` with the click event and return its two results followed by the event itself."""
-    # Start processing right away with the click event
-    output_video, status = process_video_tracker_wrapper(video_path, evt, max_frames, duration)
-    # Also return the event so the "click_state" state gets updated, just in case
-    return output_video, status, evt
 # --- INTERFACE GRADIO ---
 with gr.Blocks(title="SAM3 Ultimate Suite") as demo:
@@ -313,7 +344,6 @@ with gr.Blocks(title="SAM3 Ultimate Suite") as demo:
                 with gr.Column():
                     i2_output = gr.Image(type="pil", label="Interactive Result")
-            # APPEL DU WRAPPER CPU, PAS DE LA FONCTION GPU DIRECTEMENT
             i2_input.select(process_image_tracker_wrapper, [i2_input, points_state, labels_state, i2_multimask], [i2_output, points_state, labels_state])
             i2_clear.click(lambda: (None, [], []), outputs=[i2_output, points_state, labels_state])
@@ -325,46 +355,60 @@ with gr.Blocks(title="SAM3 Ultimate Suite") as demo:
                     v3_input = gr.Video(label="Input Video", format="mp4")
                     v3_text = gr.Textbox(label="Text Prompt", placeholder="e.g.: person, car")
                     v3_max_frames = gr.Slider(10, 300, value=50, step=10, label="Max Frames to Process")
-                    # Ajout choix durée
                     v3_duration = gr.Radio([60, 120], value=60, label="Max Processing Time (seconds)", info="Choose 60s for short clips, 120s for complex tasks")
                     v3_btn = gr.Button("Start Video Segmentation", variant="primary")
                 with gr.Column():
                     v3_output = gr.Video(label="Result Video")
                     v3_status = gr.Textbox(label="Status")
-            # Ajout v3_duration aux inputs
             v3_btn.click(process_video_text, [v3_input, v3_text, v3_max_frames, v3_duration], [v3_output, v3_status])
         # TAB 4 : VIDEO + TRACKER
         with gr.Tab("🎯 Video - Visual Tracker"):
-            gr.Markdown("### Track a specific object in video\n1. Upload a video.\n2. Wait for the first frame to appear below.\n3. Click on the object you want to track.\n4. Processing starts automatically.")
             with gr.Row():
                 with gr.Column():
                     v4_input = gr.Video(label="Input Video", format="mp4")
-                    v4_frame0 = gr.Image(label="First Frame (Click the object here)", interactive=True)
                     v4_max_frames = gr.Slider(10, 300, value=50, step=10, label="Max Frames to Process")
                     v4_duration = gr.Radio([60, 120], value=60, label="Max Processing Time (seconds)", info="Choose 60s for short clips, 120s for complex tasks")
                     with gr.Row():
                         v4_btn = gr.Button("Start Object Tracking", variant="primary")
-                        v4_clear = gr.Button("Reset Tracking") # Nouveau bouton Reset
                 with gr.Column():
                     v4_output = gr.Video(label="Result Video")
                     v4_status = gr.Textbox(label="Status")
-            v4_input.change(get_first_frame, inputs=v4_input, outputs=v4_frame0)
-            click_state = gr.State()
-            # AUTO-START : Le clic déclenche directement le tracking + sauvegarde l'état
             v4_frame0.select(
-                video_select_trigger,
-                inputs=[v4_input, v4_max_frames, v4_duration],
-                outputs=[v4_output, v4_status, click_state]
             )
-            # Bouton manuel (utilise l'état sauvegardé par le clic)
-            v4_btn.click(process_video_tracker_wrapper, [v4_input, click_state, v4_max_frames, v4_duration], [v4_output, v4_status])
-            # Bouton Reset
-            v4_clear.click(lambda: (None, "", None), outputs=[v4_output, v4_status, click_state])
 if __name__ == "__main__":
     demo.launch(share=False, debug=True, theme=gr.themes.Soft())

 import torch
 import numpy as np
 import cv2
+from PIL import Image, ImageDraw
 import matplotlib.pyplot as plt
 import matplotlib
 import tempfile
         return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
     return None
+def draw_points_on_image(image, points):
+    """Dessine des points rouges sur l'image pour feedback visuel."""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    # Créer une copie pour dessiner
+    draw_img = image.copy()
+    draw = ImageDraw.Draw(draw_img)
+    for pt in points:
+        x, y = pt
+        r = 5
+        draw.ellipse((x-r, y-r, x+r, y+r), fill="red", outline="white")
+    return draw_img
 # --- HELPERS FOR DYNAMIC ZEROGPU DURATION ---
 def compute_duration_text(video_path, text_prompt, max_frames, timeout_seconds):
     """Return ``timeout_seconds`` unchanged as the duration for a text-prompted video run.
     Only ``timeout_seconds`` is read; the other parameters are accepted and ignored.
     NOTE(review): presumably passed as ``duration=`` to ``@spaces.GPU`` - confirm at the decorator.
     """
     return timeout_seconds
+def compute_duration_tracker(video_path, points_state, labels_state, max_frames, timeout_seconds):
+    """Return ``timeout_seconds`` unchanged; used as the ``duration`` callable of ``@spaces.GPU`` on ``process_video_tracker_gpu``, whose parameters it mirrors."""
     return timeout_seconds
 # --- LOGIC WITH ZEROGPU DECORATORS ---
     except Exception as e:
         return image, f"Error: {str(e)}"
+# Image tracker with multi-point support
 @spaces.GPU
 def process_image_tracker_gpu(image, x, y, points_state, labels_state, multimask):
     if image is None: return image, [], []
     if points_state is None: points_state = []; labels_state = []
     points_state.append([x, y])
     labels_state.append(1)
              best_idx = np.argmax(scores)
              masks_to_show = masks_to_show[best_idx:best_idx+1]
         final_img = overlay_masks(image, masks_to_show)
+        # Dessiner les points
+        final_img = draw_points_on_image(final_img, points_state)
+        return final_img, points_state, labels_state
     except Exception as e:
         print(f"Tracker Error: {e}")
         return image, points_state, labels_state
 def process_image_tracker_wrapper(image, evt: gr.SelectData, points_state, labels_state, multimask):
     """Forward an image click to the GPU tracker as plain coordinates.
     Without a select event the image and both states are returned unchanged;
     otherwise the event's ``index`` is unpacked into ``(x, y)`` and handed to
     ``process_image_tracker_gpu`` together with the remaining arguments.
     """
     if evt is None:
         return image, points_state, labels_state
     click_x, click_y = evt.index
     return process_image_tracker_gpu(
         image, click_x, click_y, points_state, labels_state, multimask
     )
         return output_path, "Done!"
     except Exception as e: return None, f"Error: {str(e)}"
+# --- MULTI-POINT VIDEO TRACKER ---
+# CPU function that adds a point VISUALLY (without calling the GPU)
+def add_point_video_preview(video_path, evt: gr.SelectData, points_state, labels_state):
+    """Ajoute un point à la liste et met à jour l'image de preview avec un point rouge."""
+    if not video_path: return None, points_state, labels_state
+    # Récupérer la frame originale brute (sans points)
+    # Pour faire simple ici, on la recharge à chaque fois.
+    # Optimisation possible: stocker l'image originale dans un State.
+    orig_frame = get_first_frame(video_path)
+    if orig_frame is None: return None, points_state, labels_state
+    orig_img = Image.fromarray(orig_frame)
+    x, y = evt.index
+    if points_state is None: points_state = []; labels_state = []
+    points_state.append([x, y])
+    labels_state.append(1)
+    # Dessiner TOUS les points sur l'image originale
+    preview_img = draw_points_on_image(orig_img, points_state)
+    return preview_img, points_state, labels_state
 @spaces.GPU(duration=compute_duration_tracker)
+def process_video_tracker_gpu(video_path, points_state, labels_state, max_frames, timeout_seconds):
+    if not video_path or not points_state: return None, "Please click on the frame first."
     try:
         model, processor = get_model("sam3_video_tracker")
         cap = cv2.VideoCapture(video_path)
             frame_count += 1
         cap.release()
         inference_session = processor.init_video_session(video=frames, inference_device=device, dtype=torch.bfloat16)
+        # Envoi de TOUS les points accumulés
+        input_points = [[points_state]] # [Obj=1 [Points...]]
+        input_labels = [[labels_state]]
+        processor.add_inputs_to_inference_session(
+            inference_session=inference_session,
+            frame_idx=0,
+            obj_ids=1,
+            input_points=input_points,
+            input_labels=input_labels
+        )
         output_path = tempfile.mktemp(suffix=".mp4")
         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
         out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
         print(f"Video Tracker Error: {e}")
         return None, f"Fatal Error: {str(e)}"
 # --- INTERFACE GRADIO ---
 with gr.Blocks(title="SAM3 Ultimate Suite") as demo:
                 with gr.Column():
                     i2_output = gr.Image(type="pil", label="Interactive Result")
             i2_input.select(process_image_tracker_wrapper, [i2_input, points_state, labels_state, i2_multimask], [i2_output, points_state, labels_state])
             i2_clear.click(lambda: (None, [], []), outputs=[i2_output, points_state, labels_state])
                     v3_input = gr.Video(label="Input Video", format="mp4")
                     v3_text = gr.Textbox(label="Text Prompt", placeholder="e.g.: person, car")
                     v3_max_frames = gr.Slider(10, 300, value=50, step=10, label="Max Frames to Process")
                     v3_duration = gr.Radio([60, 120], value=60, label="Max Processing Time (seconds)", info="Choose 60s for short clips, 120s for complex tasks")
                     v3_btn = gr.Button("Start Video Segmentation", variant="primary")
                 with gr.Column():
                     v3_output = gr.Video(label="Result Video")
                     v3_status = gr.Textbox(label="Status")
             v3_btn.click(process_video_text, [v3_input, v3_text, v3_max_frames, v3_duration], [v3_output, v3_status])
         # TAB 4 : VIDEO + TRACKER
         with gr.Tab("🎯 Video - Visual Tracker"):
+            gr.Markdown("### Track a specific object in video (Multi-point Support)\n1. Upload a video.\n2. Click on the object in the 'First Frame'. **You can click multiple times** to refine the selection.\n3. Click 'Start Object Tracking' when ready.")
             with gr.Row():
                 with gr.Column():
                     v4_input = gr.Video(label="Input Video", format="mp4")
+                    v4_frame0 = gr.Image(label="First Frame (Click to add points)", interactive=True)
                     v4_max_frames = gr.Slider(10, 300, value=50, step=10, label="Max Frames to Process")
                     v4_duration = gr.Radio([60, 120], value=60, label="Max Processing Time (seconds)", info="Choose 60s for short clips, 120s for complex tasks")
                     with gr.Row():
                         v4_btn = gr.Button("Start Object Tracking", variant="primary")
+                        v4_clear = gr.Button("Reset Tracking")
+                    # États pour stocker les points multiples
+                    v4_points_state = gr.State([])
+                    v4_labels_state = gr.State([])
                 with gr.Column():
                     v4_output = gr.Video(label="Result Video")
                     v4_status = gr.Textbox(label="Status")
+            # --- CORRECTION ICI ---
+            # Fusion des deux événements pour éviter le conflit (affichage vs reset)
+            def on_video_upload(video_path):
+                # 1. On récupère l'image
+                frame = get_first_frame(video_path)
+                # 2. On reset les états (points et labels vides)
+                # Retourne : Image, Points vides, Labels vides
+                return frame, [], []
+            v4_input.change(on_video_upload, inputs=v4_input, outputs=[v4_frame0, v4_points_state, v4_labels_state])
+            # ----------------------
+            # 1. Clic -> Ajout point visuel (CPU) + Mise à jour State
             v4_frame0.select(
+                add_point_video_preview,
+                inputs=[v4_input, v4_points_state, v4_labels_state],
+                outputs=[v4_frame0, v4_points_state, v4_labels_state]
             )
+            # 2. Bouton Start -> Envoi de la liste complète des points au GPU
+            v4_btn.click(process_video_tracker_gpu, [v4_input, v4_points_state, v4_labels_state, v4_max_frames, v4_duration], [v4_output, v4_status])
+            # 3. Bouton Reset -> Vide les points, recharge l'image vierge
+            def reset_tracking_view(video_path):
+                img = get_first_frame(video_path)
+                return None, "", [], [], img
+            v4_clear.click(reset_tracking_view, inputs=[v4_input], outputs=[v4_output, v4_status, v4_points_state, v4_labels_state, v4_frame0])
 if __name__ == "__main__":
     demo.launch(share=False, debug=True, theme=gr.themes.Soft())