Spaces:
Sleeping
Sleeping
Shri Jayaram committed on
Commit ·
35ea9eb
1
Parent(s): 1c9286b
initial commit
Browse files- FlowChart.png +0 -0
- app.py +457 -0
- arial.ttf +0 -0
- florence.py +59 -0
- requirements.txt +11 -0
- sam.py +46 -0
- sam2_hiera_s.yaml +117 -0
- sam2_hiera_small.pt +3 -0
FlowChart.png
ADDED
|
app.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import io
|
| 5 |
+
from io import BytesIO
|
| 6 |
+
import os
|
| 7 |
+
import cv2
|
| 8 |
+
import numpy as np
|
| 9 |
+
import supervision as sv
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
from rembg import remove
|
| 12 |
+
import mediapipe as mp
|
| 13 |
+
import torch
|
| 14 |
+
from transformers import AutoProcessor, AutoModelForCausalLM
|
| 15 |
+
from transformers.dynamic_module_utils import get_imports
|
| 16 |
+
from unittest.mock import patch
|
| 17 |
+
from scipy.spatial import distance as dist
|
| 18 |
+
|
| 19 |
+
st.set_page_config(layout="wide", page_title="Ring Size Measurement")

# Lookup table mapping finger diameter in millimetres -> ring size.
# NOTE(review): the values (3–13.5) look like US ring sizes — confirm scale.
ring_size_dict = {
    14.0: 3,
    14.4: 3.5,
    14.8: 4,
    15.2: 4.5,
    15.6: 5,
    16.0: 5.5,
    16.45: 6,
    16.9: 6.5,
    17.3: 7,
    17.7: 7.5,
    18.2: 8,
    18.6: 8.5,
    19.0: 9,
    19.4: 9.5,
    19.8: 10,
    20.2: 10.5,
    20.6: 11,
    21.0: 11.5,
    21.4: 12,
    21.8: 12.5,
    22.2: 13,
    22.6: 13.5
}
|
| 44 |
+
|
| 45 |
+
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 46 |
+
|
| 47 |
+
# def fixed_get_imports(filename: str | os.PathLike) -> list[str]:
|
| 48 |
+
# if not str(filename).endswith("modeling_florence2.py"):
|
| 49 |
+
# return get_imports(filename)
|
| 50 |
+
# imports = get_imports(filename)
|
| 51 |
+
# imports.remove("flash_attn")
|
| 52 |
+
# return imports
|
| 53 |
+
|
| 54 |
+
# def load_model():
|
| 55 |
+
# model_id = "microsoft/Florence-2-base-ft"
|
| 56 |
+
# processor = AutoProcessor.from_pretrained(model_id, torch_dtype=torch.qint8, trust_remote_code=True)
|
| 57 |
+
|
| 58 |
+
# try:
|
| 59 |
+
# os.mkdir("temp")
|
| 60 |
+
# except:
|
| 61 |
+
# pass
|
| 62 |
+
|
| 63 |
+
# with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
|
| 64 |
+
# model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", trust_remote_code=True)
|
| 65 |
+
|
| 66 |
+
# Qmodel = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
|
| 67 |
+
# return Qmodel.to(device), processor
|
| 68 |
+
|
| 69 |
+
# if 'model_loaded' not in st.session_state:
|
| 70 |
+
# st.session_state.model_loaded = False
|
| 71 |
+
|
| 72 |
+
# if not st.session_state.model_loaded:
|
| 73 |
+
# with st.spinner('Loading model...'):
|
| 74 |
+
# st.session_state.model, st.session_state.processor = load_model()
|
| 75 |
+
# st.session_state.model_loaded = True
|
| 76 |
+
# st.write("Model loaded complete")
|
| 77 |
+
from florence import load_florence_model, run_florence_inference, \
|
| 78 |
+
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
|
| 79 |
+
from sam import load_sam_image_model, run_sam_inference
|
| 80 |
+
|
| 81 |
+
# Select the compute device and enable CUDA-specific speedups.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    # NOTE(review): the autocast context is entered but never exited, so
    # bfloat16 autocasting stays active for the whole process lifetime.
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        # Compute capability 8.x (Ampere) or newer: allow TF32 matmuls.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
else:
    DEVICE = torch.device("cpu")

# Load the Florence-2 grounding model and the SAM2 image predictor once,
# at module import time (these are reused by every request).
FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
|
| 92 |
+
|
| 93 |
+
def calculate_pixel_per_metric(image, known_diameter_of_coin=25):
    """Estimate the image's pixels-per-millimetre ratio from a reference object.

    A "ruler" is located with Florence-2 open-vocabulary detection, segmented
    with SAM2, and the tallest contour of the resulting mask is compared
    against an assumed real-world length of 160 mm.

    Parameters:
        image: BGR image array (OpenCV convention) containing the scene.
        known_diameter_of_coin: NOTE(review): unused — the code measures a
            ruler with a hard-coded 160 mm length, not a coin.

    Returns:
        (pixels_per_metric, None, annotated RGBA PIL image). The second slot
        is always None (see plot_bbox's return value); the caller names it
        ``mm_per_pixel`` but never uses it as such.

    NOTE(review): ``original_image`` and the nested ``extrac_the_obj`` helper
    are never used. If the ruler is not detected or has no contour, nested
    helpers fall off the end and return None, which makes the caller's tuple
    unpacking raise — confirm whether that is the intended failure mode.
    """
    original_image = image.copy()

    def extrac_the_obj(input_image: Image.Image, mask: Image.Image):
        # Cut the masked object out of input_image onto a transparent canvas.
        input_array = np.array(input_image)
        mask_array = np.array(mask.convert("L"))

        binary_mask = (mask_array > 0).astype("uint8")

        output_array = np.zeros((*input_array.shape[:2], 4), dtype=np.uint8)

        # Copy RGB under the mask and make those pixels fully opaque.
        output_array[binary_mask == 1, :3] = input_array[binary_mask == 1]
        output_array[binary_mask == 1, 3] = 255

        return Image.fromarray(output_array, 'RGBA')

    @torch.inference_mode()
    @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
    def get_obj_mask(image_input, text_input) -> Optional[Image.Image]:
        # Detect `text_input` with Florence-2, then refine to a mask with SAM2.
        if image_input is None:
            st.warning("Please upload an image.")
            return None

        if not text_input:
            st.warning("Please enter a text prompt.")
            return None

        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=image_input,
            task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
            text=text_input
        )
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2,
            result=result,
            resolution_wh=image_input.size
        )
        detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
        if len(detections) == 0:
            st.warning("No objects detected.")
            return None

        # Only the first detection's mask is used; 0/255 grayscale image.
        return Image.fromarray(detections.mask[0].astype("uint8") * 255)

    # def plot_bbox(original_image, data):
    #     # Create a copy of the original image to draw on
    #     image_with_bboxes = original_image.copy()

    #     # Use Pillow to draw bounding boxes and labels
    #     draw = ImageDraw.Draw(image_with_bboxes)
    #     def calculate_bbox_dimensions(bbox):
    #         x1, y1, x2, y2 = bbox
    #         width = x2 - x1
    #         height = y2 - y1
    #         return width, height

    #     font = ImageFont.truetype("arial.ttf", 28)
    #     for bbox, label in zip(data['bboxes'], data['labels']):
    #         x1, y1, x2, y2 = bbox
    #         draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
    #         draw.text((x1, y1), label, fill="red", font=font)

    #         width, height = calculate_bbox_dimensions(bbox)
    #         print(f"Label: {label}, Width: {width}, Height: {height}")
    #         dimension_text = f"W: {width}, H: {height}"
    #         draw.text((x1, y1 + 20), dimension_text, fill="red", font=font)

    #         real_world_dimension_mm = 160
    #         largest_dimension = max(width, height)
    #         pixels_per_mm = largest_dimension / real_world_dimension_mm
    #         ratio_text = f"Pixels/mm: {pixels_per_mm:.2f}"
    #         draw.text((x1, y1 + 40), ratio_text, fill="red", font=font)

    def plot_bbox(the_obj, mask, known_length = 160):
        # Derive pixels-per-mm from the tallest contour of `mask`, assuming
        # that contour spans `known_length` millimetres in the real world.
        input_array = np.array(the_obj)
        mask_array = np.array(mask.convert("L"))

        # Create binary mask
        binary_mask = (mask_array > 0).astype("uint8")

        output_array = np.zeros((*input_array.shape[:2], 4), dtype=np.uint8)

        output_array[binary_mask == 1, :3] = input_array[binary_mask == 1]
        output_array[binary_mask == 1, 3] = 255

        contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        if contours:
            # Use the tallest bounding box among all contours as the object
            # height in pixels.
            heights = [cv2.boundingRect(contour)[3] for contour in contours]
            m_ht = max(heights)

            pixels_per_metric = m_ht / known_length
            text = f"Pixel per Metric -> {pixels_per_metric:.2f} px/mm Actual Ht.: {m_ht} px Known Length : {known_length} mm"
            print(text)
            output_image = Image.fromarray(output_array, 'RGBA')
            if text:
                # Burn the calibration summary into the output image.
                draw = ImageDraw.Draw(output_image)
                font = ImageFont.truetype("arial.ttf", 34) # You can specify a TTF font if you have one
                text_position = (10, 10) # You can change this position
                draw.text(text_position, text, fill=(255, 255, 255, 255), font=font)
            return output_image, pixels_per_metric, None
            # return Image.fromarray(output_array, 'RGBA'), pixels_per_metric, None
        # NOTE(review): when no contours are found this function implicitly
        # returns None and the caller's 3-value unpacking fails.

    def finding_ruler(image, task_prompt, text_input=None):
        # Locate the ruler and convert its mask into a px/mm calibration.
        # NOTE(review): `task_prompt` is accepted but never used.
        known_length = 160 # roughly 16cm so 160 mm
        results = get_obj_mask(image, text_input=text_input)
        # the_obj = extrac_the_obj(image, results)
        image_with_bboxes, value_1, value_2 = plot_bbox(image, results, known_length)
        return value_1, value_2, image_with_bboxes

    # Florence-2 expects an RGB PIL image; the incoming array is OpenCV BGR.
    image_for_model = image.copy()

    image_for_model = cv2.cvtColor(image_for_model, cv2.COLOR_BGR2RGB)
    image_for_model = Image.fromarray(image_for_model)
    # if image_for_model.mode != 'RGB':
    #     image_for_model = image_for_model.convert('RGB')

    # Process the image
    text_input = "ruler"
    task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
    pixel_per_metric, mm_per_pixel, marked_image_buf = finding_ruler(image_for_model, task_prompt, text_input)

    return pixel_per_metric, mm_per_pixel, marked_image_buf
|
| 218 |
+
|
| 219 |
+
def process_image(image):
    """Strip the background from *image* using rembg's ``remove``."""
    foreground = remove(image)
    return foreground
|
| 221 |
+
|
| 222 |
+
def calculate_pip_width(image, original_img, pixel_per_metric):
    """Measure the ring finger's width in millimetres at the PIP joint and at
    the PIP–MCP midpoint, and annotate the measurements onto ``original_img``.

    Parameters:
        image: background-removed BGR image of the hand (output of rembg).
        original_img: original BGR image; drawn on in place by calSize.
        pixel_per_metric: px/mm calibration from calculate_pixel_per_metric.

    Returns:
        (annotated original_img, largest width in whole mm, contour-outlined
        copy of `image`, untouched copy of `image`).

    NOTE(review): if no contour is found, no hand is detected, or the
    landmark list is empty, the function falls off the end and returns None;
    the caller's 4-value unpacking would then raise. Also, the slope
    computations divide by zero when the two joints share an x (or the
    perpendicular slope when they share a y) — confirm intended handling.
    """
    def calSize(xA, yA, xB, yB, color_circle, color_line, img):
        # Draw the edge points and connecting line, convert the pixel
        # distance to mm via pixel_per_metric, and label it on `img`.
        d = dist.euclidean((xA, yA), (xB, yB))
        cv2.circle(img, (int(xA), int(yA)), 5, color_circle, -1)
        cv2.circle(img, (int(xB), int(yB)), 5, color_circle, -1)
        cv2.line(img, (int(xA), int(yA)), (int(xB), int(yB)), color_line, 2)
        d_mm = d / pixel_per_metric
        # Fixed 1.5 mm correction — presumably compensates for the contour
        # extending past the finger's true edge; TODO confirm calibration.
        d_mm = d_mm - 1.5
        cv2.putText(img, "{:.1f}".format(d_mm), (int(xA - 15), int(yA - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.65, (255, 255, 255), 2)
        # print(d_mm)
        return d_mm

    def process_point(point, cnt, m1, b):
        # Starting from `point`, walk left and right along the line
        # y = m1*x + b until each probe exits the contour `cnt`; the two
        # exit points are the finger edges along that perpendicular.
        x1, x2 = point[0], point[0]
        y1 = m1 * x1 + b
        y2 = m1 * x2 + b

        result = 1.0
        while result > 0:
            result = cv2.pointPolygonTest(cnt, (x1, y1), False)
            x1 += 1
            y1 = m1 * x1 + b
        # Step back the one-pixel overshoot from the final iteration.
        x1 -= 1

        result = 1.0
        while result > 0:
            result = cv2.pointPolygonTest(cnt, (x2, y2), False)
            x2 -= 1
            y2 = m1 * x2 + b
        x2 += 1

        return x1, y1, x2, y2

    og_img = original_img.copy()
    imgH, imgW, _ = image.shape
    imgcpy = image.copy()
    # Binarise the background-removed image: any non-black pixel is "hand".
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary_image = cv2.threshold(image_gray, 1, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contour_image = np.zeros_like(image_gray)
    cv2.drawContours(contour_image, contours, -1, (255), thickness=cv2.FILLED)
    cv2.drawContours(imgcpy, contours, -1, (0, 255, 0), 2)
    # print("length : ",len(contours))

    marked_img = image.copy()

    if len(contours) > 0:
        # Largest contour is assumed to be the hand.
        cnt = max(contours, key=cv2.contourArea)
        # MediaPipe expects RGB; run it on the unmodified original frame.
        frame2 = cv2.cvtColor(og_img, cv2.COLOR_BGR2RGB)
        handsLM = mp.solutions.hands.Hands(max_num_hands=1, min_detection_confidence=0.8, min_tracking_confidence=0.8)
        pr = handsLM.process(frame2)
        # print(pr.multi_hand_landmarks)
        if pr.multi_hand_landmarks:
            for hand_landmarks in pr.multi_hand_landmarks:
                lmlist = []
                for id, landMark in enumerate(hand_landmarks.landmark):
                    # Landmarks are normalised [0,1]; convert to pixels.
                    xPos, yPos = int(landMark.x * imgW), int(landMark.y * imgH)
                    lmlist.append([id, xPos, yPos])

                if len(lmlist) != 0:
                    # Ring finger joints: landmark 14 = PIP, 13 = MCP.
                    pip_joint = [lmlist[14][1], lmlist[14][2]]
                    mcp_joint = [lmlist[13][1], lmlist[13][2]]

                    midpoint_x = (pip_joint[0] + mcp_joint[0]) / 2
                    midpoint_y = (pip_joint[1] + mcp_joint[1]) / 2
                    midpoint = [midpoint_x, midpoint_y]

                    # Perpendicular through the PIP joint: m1 is the negative
                    # reciprocal of the PIP->MCP slope m2.
                    m2 = (pip_joint[1] - mcp_joint[1]) / (pip_joint[0] - mcp_joint[0])
                    m1 = -1 / m2
                    b = pip_joint[1] - m1 * pip_joint[0]

                    #pip_joint
                    x1_pip, y1_pip, x2_pip, y2_pip = process_point(pip_joint, cnt, m1, b)

                    # Perpendicular through the PIP–MCP midpoint.
                    m2 = (midpoint_y - mcp_joint[1]) / (midpoint_x - mcp_joint[0])
                    m1 = -1 / m2
                    b = midpoint_y - m1 * midpoint_x

                    #midpoint
                    x1_mid, y1_mid, x2_mid, y2_mid = process_point(midpoint, cnt, m1, b)

                    d_mm_pip = calSize(x1_pip, y1_pip, x2_pip, y2_pip, (255, 0, 0), (255, 0, 255), original_img)
                    d_mm_mid = calSize(x1_mid, y1_mid, x2_mid, y2_mid, (0, 255, 0), (0, 0, 255), original_img)

                    # Report the wider of the two measurements, truncated to
                    # whole millimetres.
                    largest_d_mm = max(int(d_mm_mid),int(d_mm_pip))
                    return original_img, largest_d_mm, imgcpy, marked_img
|
| 308 |
+
|
| 309 |
+
def mark_hand_landmarks(image_path):
    """Draw MediaPipe hand landmarks on the image and highlight the ring
    finger's MCP (landmark 13) and PIP (landmark 14) joints.

    NOTE(review): despite the name, ``image_path`` is a BGR image array, not
    a file path — the caller passes a numpy copy of the decoded upload.
    The array is annotated in place and also returned.
    """

    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands()
    mp_draw = mp.solutions.drawing_utils

    img = image_path
    # MediaPipe expects RGB; the incoming array is OpenCV BGR.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    results = hands.process(img_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(img, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Ring finger joints in the MediaPipe hand model.
            mcp = hand_landmarks.landmark[13]
            pip = hand_landmarks.landmark[14]

            img_height, img_width, _ = img.shape

            # Landmarks are normalised [0, 1]; convert to pixel coordinates.
            mcp_x, mcp_y = int(mcp.x * img_width), int(mcp.y * img_height)
            pip_x, pip_y = int(pip.x * img_width), int(pip.y * img_height)

            cv2.circle(img, (mcp_x, mcp_y), 10, (255, 0, 0), -1)
            cv2.circle(img, (pip_x, pip_y), 10, (255, 0, 0), -1)

    return img
|
| 336 |
+
|
| 337 |
+
def show_resized_image(images, titles, scale=0.5):
    """Render up to six (image, title) pairs on a 2x3 matplotlib grid and
    return the rendered figure as an in-memory PNG stream.

    Parameters:
        images: sequence of BGR image arrays (OpenCV convention).
        titles: subplot titles, paired positionally with `images`.
        scale: resize factor applied to each image before plotting.

    Returns:
        io.BytesIO containing the PNG, seeked back to position 0.
    """
    num_images = len(images)

    fig, axes = plt.subplots(2, 3, figsize=(17, 13))
    axes = axes.flatten()

    # Hide grid cells that have no image assigned.
    for ax in axes[num_images:]:
        ax.axis('off')
    # (Removed a leftover debug counter and print() from the loop below.)
    for ax, img, title in zip(axes, images, titles):
        resized_image = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
        # Matplotlib expects RGB; the inputs are OpenCV BGR.
        ax.imshow(cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB))
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    img_stream = BytesIO()
    plt.savefig(img_stream, format='png')
    img_stream.seek(0)
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close(fig)
    return img_stream
|
| 360 |
+
|
| 361 |
+
def get_ring_size(mm_value):
    """Look up the ring size for a finger diameter of *mm_value* millimetres.

    Exact diameters are resolved directly from ``ring_size_dict``; any other
    value maps to the size of the nearest known diameter.
    """
    try:
        return ring_size_dict[mm_value]
    except KeyError:
        nearest_diameter = min(ring_size_dict, key=lambda known: abs(known - mm_value))
        return ring_size_dict[nearest_diameter]
|
| 367 |
+
|
| 368 |
+
# --- Static page copy: headline, sidebar header, and the workflow
# --- explanation rendered on every page load.
# st.set_page_config(layout="wide", page_title="Ring Size Measurement")
st.write("## Determine Your Ring Size")
st.write(
    "📏 Upload an image of your hand to measure the finger width and determine your ring size. The measurement will be displayed along with a visual breakdown of the image processing flow."
)
st.sidebar.write("## Upload :gear:")
#~~
st.write("### Workflow Overview")
st.image("FlowChart.png", caption="Workflow Overview", use_column_width=True)

st.write("### Detailed Workflow")
st.write("1. **Florence-2 Model:** Florence-2 is an advanced vision foundation model that uses a prompt-based approach to handle a wide range of vision and vision-language tasks.We utilize this model to detect the scale within the image and mark a bounding box which we can use to find the approximate full measurement of scale.")
st.write("2. **Pixel Per Metric Ratio:** The Pixel Per Metric Ratio is used to convert pixel measurements into real-world units. By comparing the pixel length obtained from image analysis (i.e., Hough Circle) with the known real-world measurement of the reference object (coin), we get the ratio. This ratio then allows us to accurately scale and size estimation of objects within the image.")
st.write("3. **Background Removal:** Removing the background first ensures that only the relevant subject is highlighted. We start by converting the image to grayscale and applying thresholding to distinguish the subject from the background. Erosion and dilation then clean up the image, improving the detection of specific features like individual fingers.")
st.write("4. **Contour Detection:** We use Contour Detection to find the largest contour, which allows us to outline or draw a boundary around the subject (i.e., hand). This highlights the object's shape and edges, improving the precision of the subject.")
st.write("5. **Finding Hand Landmarks:** This involves using the MediaPipe library to identify key points on the hand, such as the PIP (Proximal Interphalangeal) and MCP (Metacarpophalangeal) joints of the ring finger. This enables precise tracking and analysis of finger positions and movements.")
st.write("6. **Determining Finger Width:** Here we use the slope formula `[y = mx + b]` with PIP and MCP points to measure the finger's width. We project outward perpendicularly from the PIP point towards the MCP point, then apply a point polygon test to accurately determine the pixel width of the finger.")
st.write("7. **Predicting Ring Size:** Predicting Ring Size involves calculating the finger’s diameter using the Pixel Per Metric Ratio and the largest width measurement at the PIP or MCP joint. This diameter is then used to predict the appropriate ring size.")
#~~

# Upload size cap enforced before any processing starts.
MAX_FILE_SIZE = 5 * 1024 * 1024 # 5MB
|
| 389 |
+
|
| 390 |
+
def process_image_and_get_results(upload):
    """Run the full measurement pipeline for one uploaded hand image.

    Steps: decode to BGR, draw hand landmarks, calibrate px/mm from the
    detected ruler, remove the background, measure the ring-finger width, and
    map that width to a ring size.

    Parameters:
        upload: file-like object from st.file_uploader (PIL-openable).

    Returns:
        dict with the annotated/intermediate images plus ``width_mm`` (int)
        and ``ring_size`` (float).
    """
    image = Image.open(upload)
    # image = cv2.imread(upload)
    image_np = np.array(image)
    # PIL decodes RGB; convert to BGR for the OpenCV-based pipeline.
    image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
    # Independent copies: several steps annotate their input in place.
    original_img = image_np.copy()
    og_img1 = image_np.copy()
    og_img2 = image_np.copy()
    img_1 = image_np.copy()
    hand_lms = mark_hand_landmarks(img_1)

    pixel_per_metric, mm_per_pixel, image_with_coin_info = calculate_pixel_per_metric(image_np)
    processed_image = process_image(og_img1)
    image_with_pip_width, width_mm, contour_image, pip_mark_img = calculate_pip_width(processed_image, original_img, pixel_per_metric)
    image_with_coin_info = np.array(image_with_coin_info)
    # NOTE(review): np.array(...) never yields None, so this branch is dead;
    # a None return from calculate_pixel_per_metric would already have failed
    # at the tuple unpacking above.
    if image_with_coin_info is None:
        print("inside1")
        raise ValueError("Image is None, cannot resize.")

    elif not isinstance(image_with_coin_info, (np.ndarray, cv2.UMat)):
        print("inside2")
        raise TypeError(f"Invalid image type: {type(image_with_coin_info)}. Expected numpy array or cv2.UMat.")
    ring_size = get_ring_size(width_mm)
    return {
        "processed_image": image_with_pip_width,
        "original_image": og_img2,
        "hand_lm_marked_image": hand_lms,
        "image_with_coin_info": image_with_coin_info,
        "contour_image": contour_image,
        "width_mm": width_mm,
        "ring_size": ring_size
    }
|
| 422 |
+
|
| 423 |
+
def show_how_it_works(processed_image):
    """Render the 'How It Works' explainer section around *processed_image*."""
    write_section = st.write
    write_section("## How It Works")
    write_section("Here's a step-by-step breakdown of how your image is processed to determine your ring size:")
    st.image(processed_image, caption="Image Processing Flow", use_column_width=True)
|
| 427 |
+
|
| 428 |
+
# --- Main interaction flow: upload -> size check -> pipeline -> results. ---
col1, col2 = st.columns(2)
my_upload = st.sidebar.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

if my_upload is not None:
    if my_upload.size > MAX_FILE_SIZE:
        st.error("The uploaded file is too large. Please upload an image smaller than 5MB.")
    else:
        st.write("## Image Processing Flow")
        results = process_image_and_get_results(my_upload)

        # Images are BGR internally; convert to RGB for Streamlit display.
        col1.write("Uploaded Image :camera:")
        col1.image(cv2.cvtColor(results["original_image"], cv2.COLOR_BGR2RGB), caption="Uploaded Image")

        col2.write("Processed Image :wrench:")
        col2.image(cv2.cvtColor(results["processed_image"], cv2.COLOR_BGR2RGB), caption="Processed Image with PIP Width")

        st.write(f"📏 The width of your finger is {results['width_mm']:.2f} mm, and the estimated ring size is {results['ring_size']:.1f}.")

        # Optional expanded view of every intermediate stage.
        if st.button("How it Works"):
            st.write("## How It Works")
            st.write("Here's a step-by-step breakdown of how your image is processed to determine your ring size:")
            print("here")
            img_stream = show_resized_image(
                [results["original_image"], results["image_with_coin_info"], results["contour_image"], results["hand_lm_marked_image"], results["processed_image"]],
                ['Original Image', 'Image with Scale Info', 'Contour Boundary Image', 'Hand Landmarks', 'Ring Finger Width'],
                scale=0.5
            )
            st.image(img_stream, caption="Processing Flow", use_column_width=True)
else:
    st.info("Please upload an image to get started.")
|
arial.ttf
ADDED
|
Binary file (312 kB). View file
|
|
|
florence.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Union, Any, Tuple, Dict
|
| 3 |
+
from unittest.mock import patch
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from transformers import AutoModelForCausalLM, AutoProcessor
|
| 8 |
+
from transformers.dynamic_module_utils import get_imports
|
| 9 |
+
|
| 10 |
+
FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
|
| 11 |
+
# FLORENCE_CHECKPOINT = "microsoft/Florence-2-large"
|
| 12 |
+
FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
|
| 13 |
+
FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
|
| 14 |
+
FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
|
| 15 |
+
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
|
| 16 |
+
FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72.

    Patched replacement for ``transformers.dynamic_module_utils.get_imports``
    that drops the optional ``flash_attn`` dependency from Florence-2's
    remote-code module so the model loads without flash-attention installed.
    """
    # Match on the basename only: the previous "/modeling_florence2.py" test
    # silently failed on Windows path separators, so flash_attn was never
    # stripped there and the import would fail later.
    if not str(filename).endswith("modeling_florence2.py"):
        return get_imports(filename)
    imports = get_imports(filename)
    # Guard the removal: list.remove raises ValueError when "flash_attn" is
    # absent (e.g. a model revision that no longer imports it).
    if "flash_attn" in imports:
        imports.remove("flash_attn")
    return imports
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def load_florence_model(
    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
) -> Tuple[Any, Any]:
    """Load the Florence-2 model (in eval mode on *device*) and its processor.

    The ``get_imports`` patch strips the optional flash_attn dependency while
    the remote-code module is being resolved.
    """
    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
        florence_model = AutoModelForCausalLM.from_pretrained(
            checkpoint, trust_remote_code=True
        )
        florence_model = florence_model.to(device).eval()
        florence_processor = AutoProcessor.from_pretrained(
            checkpoint, trust_remote_code=True
        )
    return florence_model, florence_processor
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def run_florence_inference(
    model: Any,
    processor: Any,
    device: torch.device,
    image: Image,
    task: str,
    text: str = ""
) -> Tuple[str, Dict]:
    """Run one Florence-2 generation pass and post-process the output.

    Parameters:
        model: Florence-2 causal LM (as returned by load_florence_model).
        processor: the matching AutoProcessor.
        device: device the encoded inputs are moved to; must match the model.
        image: PIL image the task runs on.
        task: Florence task token, e.g. FLORENCE_OPEN_VOCABULARY_DETECTION_TASK.
        text: optional prompt text appended directly after the task token.

    Returns:
        (raw generated text including special tokens,
         task-specific parsed response dict).
    """
    # Florence-2 prompts are the task token followed immediately by the text.
    prompt = task + text
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    # Special tokens are kept: post_process_generation parses them (e.g.
    # location tokens) into the structured response.
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=task, image_size=image.size)
    return generated_text, response
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
Pillow
|
| 3 |
+
numpy
|
| 4 |
+
opencv-python
|
| 5 |
+
supervision
|
| 6 |
+
matplotlib
|
| 7 |
+
rembg
|
| 8 |
+
mediapipe
|
| 9 |
+
torch
|
| 10 |
+
transformers
|
| 11 |
+
scipy
|
sam.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import supervision as sv
|
| 5 |
+
import torch
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from sam2.build_sam import build_sam2, build_sam2_video_predictor
|
| 8 |
+
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
| 9 |
+
|
| 10 |
+
SAM_CHECKPOINT = "sam2_hiera_small.pt"
|
| 11 |
+
SAM_CONFIG = "sam2_hiera_s.yaml"
|
| 12 |
+
# SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
|
| 13 |
+
# SAM_CONFIG = "sam2_hiera_l.yaml"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def load_sam_image_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> SAM2ImagePredictor:
    """Build a SAM2 model from *config*/*checkpoint* on *device* and wrap it
    in an image predictor."""
    sam2_model = build_sam2(config, checkpoint, device=device)
    predictor = SAM2ImagePredictor(sam_model=sam2_model)
    return predictor
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_sam_video_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> Any:
    """Construct a SAM2 video predictor from *config*/*checkpoint* on *device*."""
    video_predictor = build_sam2_video_predictor(config, checkpoint, device=device)
    return video_predictor
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def run_sam_inference(
    model: Any,
    image: Image,
    detections: sv.Detections
) -> sv.Detections:
    """Predict a SAM2 mask for each detection box and attach the masks.

    Parameters:
        model: a SAM2ImagePredictor (see load_sam_image_model).
        image: PIL image to segment; converted to an RGB array.
        detections: sv.Detections carrying xyxy boxes; mutated in place by
            setting ``detections.mask`` and also returned.
    """
    image = np.array(image.convert("RGB"))
    model.set_image(image)
    mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)

    # With batched boxes the predictor returns masks shaped (N, 1, H, W).
    # Squeeze ONLY the per-box mask axis: a bare np.squeeze() would also
    # collapse the batch axis when N == 1, yielding a 2-D array where
    # sv.Detections expects (N, H, W).
    if len(mask.shape) == 4:
        mask = np.squeeze(mask, axis=1)

    detections.mask = mask.astype(bool)
    return detections
|
sam2_hiera_s.yaml
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
|
| 3 |
+
# Model
|
| 4 |
+
model:
|
| 5 |
+
_target_: sam2.modeling.sam2_base.SAM2Base
|
| 6 |
+
image_encoder:
|
| 7 |
+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
|
| 8 |
+
scalp: 1
|
| 9 |
+
trunk:
|
| 10 |
+
_target_: sam2.modeling.backbones.hieradet.Hiera
|
| 11 |
+
embed_dim: 96
|
| 12 |
+
num_heads: 1
|
| 13 |
+
stages: [1, 2, 11, 2]
|
| 14 |
+
global_att_blocks: [7, 10, 13]
|
| 15 |
+
window_pos_embed_bkg_spatial_size: [7, 7]
|
| 16 |
+
neck:
|
| 17 |
+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
|
| 18 |
+
position_encoding:
|
| 19 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 20 |
+
num_pos_feats: 256
|
| 21 |
+
normalize: true
|
| 22 |
+
scale: null
|
| 23 |
+
temperature: 10000
|
| 24 |
+
d_model: 256
|
| 25 |
+
backbone_channel_list: [768, 384, 192, 96]
|
| 26 |
+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
|
| 27 |
+
fpn_interp_model: nearest
|
| 28 |
+
|
| 29 |
+
memory_attention:
|
| 30 |
+
_target_: sam2.modeling.memory_attention.MemoryAttention
|
| 31 |
+
d_model: 256
|
| 32 |
+
pos_enc_at_input: true
|
| 33 |
+
layer:
|
| 34 |
+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
|
| 35 |
+
activation: relu
|
| 36 |
+
dim_feedforward: 2048
|
| 37 |
+
dropout: 0.1
|
| 38 |
+
pos_enc_at_attn: false
|
| 39 |
+
self_attention:
|
| 40 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 41 |
+
rope_theta: 10000.0
|
| 42 |
+
feat_sizes: [32, 32]
|
| 43 |
+
embedding_dim: 256
|
| 44 |
+
num_heads: 1
|
| 45 |
+
downsample_rate: 1
|
| 46 |
+
dropout: 0.1
|
| 47 |
+
d_model: 256
|
| 48 |
+
pos_enc_at_cross_attn_keys: true
|
| 49 |
+
pos_enc_at_cross_attn_queries: false
|
| 50 |
+
cross_attention:
|
| 51 |
+
_target_: sam2.modeling.sam.transformer.RoPEAttention
|
| 52 |
+
rope_theta: 10000.0
|
| 53 |
+
feat_sizes: [32, 32]
|
| 54 |
+
rope_k_repeat: True
|
| 55 |
+
embedding_dim: 256
|
| 56 |
+
num_heads: 1
|
| 57 |
+
downsample_rate: 1
|
| 58 |
+
dropout: 0.1
|
| 59 |
+
kv_in_dim: 64
|
| 60 |
+
num_layers: 4
|
| 61 |
+
|
| 62 |
+
memory_encoder:
|
| 63 |
+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
|
| 64 |
+
out_dim: 64
|
| 65 |
+
position_encoding:
|
| 66 |
+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
|
| 67 |
+
num_pos_feats: 64
|
| 68 |
+
normalize: true
|
| 69 |
+
scale: null
|
| 70 |
+
temperature: 10000
|
| 71 |
+
mask_downsampler:
|
| 72 |
+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
|
| 73 |
+
kernel_size: 3
|
| 74 |
+
stride: 2
|
| 75 |
+
padding: 1
|
| 76 |
+
fuser:
|
| 77 |
+
_target_: sam2.modeling.memory_encoder.Fuser
|
| 78 |
+
layer:
|
| 79 |
+
_target_: sam2.modeling.memory_encoder.CXBlock
|
| 80 |
+
dim: 256
|
| 81 |
+
kernel_size: 7
|
| 82 |
+
padding: 3
|
| 83 |
+
layer_scale_init_value: 1e-6
|
| 84 |
+
use_dwconv: True # depth-wise convs
|
| 85 |
+
num_layers: 2
|
| 86 |
+
|
| 87 |
+
num_maskmem: 7
|
| 88 |
+
image_size: 1024
|
| 89 |
+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
|
| 90 |
+
sigmoid_scale_for_mem_enc: 20.0
|
| 91 |
+
sigmoid_bias_for_mem_enc: -10.0
|
| 92 |
+
use_mask_input_as_output_without_sam: true
|
| 93 |
+
# Memory
|
| 94 |
+
directly_add_no_mem_embed: true
|
| 95 |
+
# use high-resolution feature map in the SAM mask decoder
|
| 96 |
+
use_high_res_features_in_sam: true
|
| 97 |
+
# output 3 masks on the first click on initial conditioning frames
|
| 98 |
+
multimask_output_in_sam: true
|
| 99 |
+
# SAM heads
|
| 100 |
+
iou_prediction_use_sigmoid: True
|
| 101 |
+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
|
| 102 |
+
use_obj_ptrs_in_encoder: true
|
| 103 |
+
add_tpos_enc_to_obj_ptrs: false
|
| 104 |
+
only_obj_ptrs_in_the_past_for_eval: true
|
| 105 |
+
# object occlusion prediction
|
| 106 |
+
pred_obj_scores: true
|
| 107 |
+
pred_obj_scores_mlp: true
|
| 108 |
+
fixed_no_obj_ptr: true
|
| 109 |
+
# multimask tracking settings
|
| 110 |
+
multimask_output_for_tracking: true
|
| 111 |
+
use_multimask_token_for_obj_ptr: true
|
| 112 |
+
multimask_min_pt_num: 0
|
| 113 |
+
multimask_max_pt_num: 1
|
| 114 |
+
use_mlp_for_obj_ptr_proj: true
|
| 115 |
+
# Compilation flag
|
| 116 |
+
compile_image_encoder: False
|
| 117 |
+
|
sam2_hiera_small.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95949964d4e548409021d47b22712d5f1abf2564cc0c3c765ba599a24ac7dce3
|
| 3 |
+
size 184309650
|