Spaces:

prithivMLmods
/

SAM3-Demo

Running on Zero

App Files Community

prithivMLmods committed 19 days ago

Commit

57943d6

verified ·

1 Parent(s): c53c756

update app

Browse files

Files changed (1) hide show

app.py +43 -44

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-import gc
 import cv2
 import tempfile
 import spaces
@@ -17,6 +16,7 @@ from transformers import (
     Sam3VideoModel, Sam3VideoProcessor
 )
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
@@ -79,45 +79,35 @@ class CustomBlueTheme(Soft):
 app_theme = CustomBlueTheme()
-MODEL_CACHE = {}
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using compute device: {device}")
-def clear_vram():
-    """Forces RAM/VRAM cleanup."""
-    if MODEL_CACHE:
-        print("🧹 Cleaning up memory...")
-        MODEL_CACHE.clear()
-    gc.collect()
-    torch.cuda.empty_cache()
-def load_segmentation_model(model_key):
-    """Lazy loads the specific SAM3 model required."""
-    if model_key in MODEL_CACHE:
-        return MODEL_CACHE[model_key]
-    clear_vram()
-    print(f"⏳ Loading {model_key}...")
-    try:
-        if model_key == "img_seg_model":
-            seg_model = Sam3Model.from_pretrained("facebook/sam3").to(device)
-            seg_processor = Sam3Processor.from_pretrained("facebook/sam3")
-            MODEL_CACHE[model_key] = (seg_model, seg_processor)
-        elif model_key == "vid_seg_model":
-            vid_model = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
-            vid_processor = Sam3VideoProcessor.from_pretrained("facebook/sam3")
-            MODEL_CACHE[model_key] = (vid_model, vid_processor)
-        print(f"✅ {model_key} loaded.")
-        return MODEL_CACHE[model_key]
-    except Exception as e:
-        print(f"❌ Error loading model: {e}")
-        clear_vram()
-        raise e
 def apply_mask_overlay(base_image, mask_data, opacity=0.5):
     """Draws segmentation masks on top of an image."""
     if isinstance(base_image, np.ndarray):
@@ -162,21 +152,27 @@ def apply_mask_overlay(base_image, mask_data, opacity=0.5):
     return Image.alpha_composite(base_image, composite_layer).convert("RGB")
 @spaces.GPU
 def run_image_segmentation(source_img, text_query, conf_thresh=0.5):
     if source_img is None or not text_query:
         raise gr.Error("Please provide an image and a text prompt.")
     try:
-        active_model, active_processor = load_segmentation_model("img_seg_model")
         pil_image = source_img.convert("RGB")
-        model_inputs = active_processor(images=pil_image, text=text_query, return_tensors="pt").to(device)
         with torch.no_grad():
-            inference_output = active_model(**model_inputs)
-        processed_results = active_processor.post_process_instance_segmentation(
             inference_output,
             threshold=conf_thresh,
             mask_threshold=0.5,
@@ -202,12 +198,13 @@ def calc_timeout_duration(vid_file, *args):
 @spaces.GPU(duration=calc_timeout_duration)
 def run_video_segmentation(source_vid, text_query, frame_limit, time_limit):
     if not source_vid or not text_query:
         raise gr.Error("Missing video or prompt.")
     try:
-        active_model, active_processor = load_segmentation_model("vid_seg_model")
         video_cap = cv2.VideoCapture(source_vid)
         vid_fps = video_cap.get(cv2.CAP_PROP_FPS)
         vid_w = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -222,14 +219,15 @@ def run_video_segmentation(source_vid, text_query, frame_limit, time_limit):
             counter += 1
         video_cap.release()
-        session = active_processor.init_video_session(video=video_frames, inference_device=device, dtype=torch.bfloat16)
-        session = active_processor.add_text_prompt(inference_session=session, text=text_query)
         temp_out_path = tempfile.mktemp(suffix=".mp4")
         video_writer = cv2.VideoWriter(temp_out_path, cv2.VideoWriter_fourcc(*'mp4v'), vid_fps, (vid_w, vid_h))
-        for model_out in active_model.propagate_in_video_iterator(inference_session=session, max_frame_num_to_track=len(video_frames)):
-            post_processed = active_processor.postprocess_outputs(session, model_out)
             f_idx = model_out.frame_idx
             original_pil = Image.fromarray(video_frames[f_idx])
@@ -248,6 +246,7 @@ def run_video_segmentation(source_vid, text_query, frame_limit, time_limit):
     except Exception as e:
         return None, f"Error during video processing: {str(e)}"
 custom_css="""
 #col-container { margin: 0 auto; max-width: 1100px; }
 #main-title h1 { font-size: 2.1em !important; }

 import os
 import cv2
 import tempfile
 import spaces
     Sam3VideoModel, Sam3VideoProcessor
 )
+# --- THEME CONFIGURATION ---
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",
 app_theme = CustomBlueTheme()
+# --- GLOBAL MODEL LOADING ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"🖥️ Using compute device: {device}")
+print("⏳ Loading SAM3 Models permanently into memory...")
+try:
+    # 1. Load Image Segmentation Model
+    print("   ... Loading Image Model")
+    IMG_MODEL = Sam3Model.from_pretrained("facebook/sam3").to(device)
+    IMG_PROCESSOR = Sam3Processor.from_pretrained("facebook/sam3")
+    # 2. Load Video Segmentation Model
+    # Using bfloat16 for video to optimize VRAM usage while keeping speed
+    print("   ... Loading Video Model")
+    VID_MODEL = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
+    VID_PROCESSOR = Sam3VideoProcessor.from_pretrained("facebook/sam3")
+    print("✅ All Models loaded successfully!")
+except Exception as e:
+    print(f"❌ CRITICAL ERROR LOADING MODELS: {e}")
+    IMG_MODEL = None
+    VID_MODEL = None
+    IMG_PROCESSOR = None
+    VID_PROCESSOR = None
+# --- UTILS ---
 def apply_mask_overlay(base_image, mask_data, opacity=0.5):
     """Draws segmentation masks on top of an image."""
     if isinstance(base_image, np.ndarray):
     return Image.alpha_composite(base_image, composite_layer).convert("RGB")
+# --- GPU INFERENCE FUNCTIONS ---
 @spaces.GPU
 def run_image_segmentation(source_img, text_query, conf_thresh=0.5):
+    if IMG_MODEL is None or IMG_PROCESSOR is None:
+        raise gr.Error("Models failed to load on startup. Check logs.")
     if source_img is None or not text_query:
         raise gr.Error("Please provide an image and a text prompt.")
     try:
         pil_image = source_img.convert("RGB")
+        # Models are already on device, just move inputs
+        model_inputs = IMG_PROCESSOR(images=pil_image, text=text_query, return_tensors="pt").to(device)
         with torch.no_grad():
+            inference_output = IMG_MODEL(**model_inputs)
+        processed_results = IMG_PROCESSOR.post_process_instance_segmentation(
             inference_output,
             threshold=conf_thresh,
             mask_threshold=0.5,
 @spaces.GPU(duration=calc_timeout_duration)
 def run_video_segmentation(source_vid, text_query, frame_limit, time_limit):
+    if VID_MODEL is None or VID_PROCESSOR is None:
+        raise gr.Error("Video Models failed to load on startup.")
     if not source_vid or not text_query:
         raise gr.Error("Missing video or prompt.")
     try:
         video_cap = cv2.VideoCapture(source_vid)
         vid_fps = video_cap.get(cv2.CAP_PROP_FPS)
         vid_w = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
             counter += 1
         video_cap.release()
+        # VID_MODEL is already on device in bfloat16
+        session = VID_PROCESSOR.init_video_session(video=video_frames, inference_device=device, dtype=torch.bfloat16)
+        session = VID_PROCESSOR.add_text_prompt(inference_session=session, text=text_query)
         temp_out_path = tempfile.mktemp(suffix=".mp4")
         video_writer = cv2.VideoWriter(temp_out_path, cv2.VideoWriter_fourcc(*'mp4v'), vid_fps, (vid_w, vid_h))
+        for model_out in VID_MODEL.propagate_in_video_iterator(inference_session=session, max_frame_num_to_track=len(video_frames)):
+            post_processed = VID_PROCESSOR.postprocess_outputs(session, model_out)
             f_idx = model_out.frame_idx
             original_pil = Image.fromarray(video_frames[f_idx])
     except Exception as e:
         return None, f"Error during video processing: {str(e)}"
+# --- GUI ---
 custom_css="""
 #col-container { margin: 0 auto; max-width: 1100px; }
 #main-title h1 { font-size: 2.1em !important; }