Justin331 commited on
Commit
3e8dd07
·
verified ·
1 Parent(s): d63a70e

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. sam3/__init__.py +7 -0
  3. sam3/agent/__init__.py +1 -0
  4. sam3/agent/agent_core.py +563 -0
  5. sam3/agent/client_llm.py +205 -0
  6. sam3/agent/client_sam3.py +138 -0
  7. sam3/agent/helpers/__init__.py +1 -0
  8. sam3/agent/helpers/boxes.py +438 -0
  9. sam3/agent/helpers/color_map.py +150 -0
  10. sam3/agent/helpers/keypoints.py +244 -0
  11. sam3/agent/helpers/mask_overlap_removal.py +128 -0
  12. sam3/agent/helpers/masks.py +560 -0
  13. sam3/agent/helpers/memory.py +87 -0
  14. sam3/agent/helpers/rle.py +122 -0
  15. sam3/agent/helpers/roi_align.py +75 -0
  16. sam3/agent/helpers/rotated_boxes.py +533 -0
  17. sam3/agent/helpers/som_utils.py +406 -0
  18. sam3/agent/helpers/visualizer.py +1662 -0
  19. sam3/agent/helpers/zoom_in.py +195 -0
  20. sam3/agent/inference.py +65 -0
  21. sam3/agent/system_prompts/system_prompt.txt +242 -0
  22. sam3/agent/system_prompts/system_prompt_iterative_checking.txt +26 -0
  23. sam3/agent/viz.py +114 -0
  24. sam3/eval/__init__.py +1 -0
  25. sam3/eval/cgf1_eval.py +703 -0
  26. sam3/eval/coco_eval.py +916 -0
  27. sam3/eval/coco_eval_offline.py +181 -0
  28. sam3/eval/coco_reindex.py +230 -0
  29. sam3/eval/coco_writer.py +352 -0
  30. sam3/eval/conversion_util.py +211 -0
  31. sam3/eval/demo_eval.py +658 -0
  32. sam3/eval/hota_eval_toolkit/__init__.py +1 -0
  33. sam3/eval/hota_eval_toolkit/run_ytvis_eval.py +114 -0
  34. sam3/eval/hota_eval_toolkit/trackeval/__init__.py +4 -0
  35. sam3/eval/hota_eval_toolkit/trackeval/_timing.py +68 -0
  36. sam3/eval/hota_eval_toolkit/trackeval/datasets/__init__.py +4 -0
  37. sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py +379 -0
  38. sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py +891 -0
  39. sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py +524 -0
  40. sam3/eval/hota_eval_toolkit/trackeval/eval.py +395 -0
  41. sam3/eval/hota_eval_toolkit/trackeval/metrics/__init__.py +4 -0
  42. sam3/eval/hota_eval_toolkit/trackeval/metrics/_base_metric.py +145 -0
  43. sam3/eval/hota_eval_toolkit/trackeval/metrics/count.py +48 -0
  44. sam3/eval/hota_eval_toolkit/trackeval/metrics/hota.py +291 -0
  45. sam3/eval/hota_eval_toolkit/trackeval/utils.py +195 -0
  46. sam3/eval/postprocessors.py +648 -0
  47. sam3/eval/saco_veval_eval.py +155 -0
  48. sam3/eval/saco_veval_evaluators.py +838 -0
  49. sam3/eval/teta_eval_toolkit/__init__.py +5 -0
  50. sam3/eval/teta_eval_toolkit/_timing.py +69 -0
.gitattributes CHANGED
@@ -1,2 +1,3 @@
1
  *.pt filter=lfs diff=lfs merge=lfs -text
2
  *.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
1
  *.pt filter=lfs diff=lfs merge=lfs -text
2
  *.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ sam3/perflib/tests/assets/masks.tiff filter=lfs diff=lfs merge=lfs -text
sam3/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from .model_builder import build_sam3_image_model
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ __all__ = ["build_sam3_image_model"]
sam3/agent/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
sam3/agent/agent_core.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import copy
4
+ import json
5
+ import os
6
+
7
+ import cv2
8
+ from PIL import Image
9
+
10
+ from .client_llm import send_generate_request
11
+ from .client_sam3 import call_sam_service
12
+ from .viz import visualize
13
+
14
+
15
def save_debug_messages(messages_list, debug, debug_folder_path, debug_jsonl_path):
    """Save the conversation history to a debug JSONL file if debug is enabled.

    Each message is serialized as exactly one compact JSON document per line
    (JSON Lines format), so the file can later be parsed line by line.
    The previous implementation used ``indent=4``, which spread every record
    over multiple lines and made the "jsonl" file unparseable per-line.

    Args:
        messages_list: List of message dicts (the conversation history).
        debug: Master switch; nothing is written when False.
        debug_folder_path: Directory that will hold the debug file.
        debug_jsonl_path: Full path of the JSONL file to (over)write.
    """
    if debug and debug_jsonl_path:
        # Ensure the debug directory exists before writing
        os.makedirs(debug_folder_path, exist_ok=True)
        with open(debug_jsonl_path, "w") as f:
            for msg in messages_list:
                # One compact JSON document per line (valid JSONL).
                f.write(json.dumps(msg) + "\n")
23
+
24
+
25
def cleanup_debug_files(debug, debug_folder_path, debug_jsonl_path):
    """Clean up debug files when function successfully returns"""
    # Only act when debugging was on and a folder was actually created.
    if not (debug and debug_folder_path):
        return
    try:
        # Remove the history file first, then the (now empty) folder.
        if os.path.exists(debug_jsonl_path):
            os.remove(debug_jsonl_path)
        if os.path.exists(debug_folder_path):
            os.rmdir(debug_folder_path)
    except Exception as e:
        # Best-effort cleanup: a leftover debug folder is not fatal.
        print(f"Warning: Could not clean up debug files: {e}")
35
+
36
+
37
def count_images(messages):
    """Count the total number of images present in the messages history."""
    num_images = 0
    for msg in messages:
        content = msg.get("content")
        # Only list-shaped content can carry image items.
        if not isinstance(content, list):
            continue
        num_images += sum(
            1
            for item in content
            if isinstance(item, dict) and item.get("type") == "image"
        )
    return num_images
52
+
53
+
54
+ def _prune_messages_for_next_round(
55
+ messages_list,
56
+ used_text_prompts,
57
+ latest_sam3_text_prompt,
58
+ img_path,
59
+ initial_text_prompt,
60
+ ):
61
+ """Return a new messages list that contains only:
62
+ 1) messages[:2] (with optional warning text added to the second message's content)
63
+ 2) the latest assistant message (and everything after it) that contains a segment_phrase tool call
64
+ """
65
+ # There should not be more than 10 messages in the conversation history
66
+ assert len(messages_list) < 10
67
+
68
+ # Part 1: always keep the first two message JSONs
69
+ part1 = copy.deepcopy(messages_list[:2])
70
+
71
+ # Part 2: search backwards for the latest assistant message containing a segment_phrase tool call
72
+ part2_start_idx = None
73
+ for idx in range(len(messages_list) - 1, 1, -1):
74
+ msg = messages_list[idx]
75
+ # We only consider assistant messages with a "content" list
76
+ if msg.get("role") != "assistant" or "content" not in msg:
77
+ continue
78
+ # Look for any content element that is a text containing the segment_phrase tool call
79
+ for content in msg["content"]:
80
+ if (
81
+ isinstance(content, dict)
82
+ and content.get("type") == "text"
83
+ and "<tool>" in content.get("text", "")
84
+ and "segment_phrase" in content.get("text", "")
85
+ ):
86
+ part2_start_idx = idx
87
+ break
88
+ if part2_start_idx is not None:
89
+ break
90
+
91
+ part2 = messages_list[part2_start_idx:] if part2_start_idx is not None else []
92
+
93
+ # Part 3: decide whether to add warning text to the second message in part1
94
+ previously_used = (
95
+ [p for p in used_text_prompts if p != latest_sam3_text_prompt]
96
+ if latest_sam3_text_prompt
97
+ else list(used_text_prompts)
98
+ )
99
+ if part2 and len(previously_used) > 0:
100
+ warning_text = f'Note that we have previously called the segment_phrase tool with each "text_prompt" in this list: {list(previously_used)}, but none of the generated results were satisfactory. So make sure that you do not use any of these phrases as the "text_prompt" to call the segment_phrase tool again.'
101
+ # Replace the second message entirely to keep exactly 2 content items
102
+ part1[1] = {
103
+ "role": "user",
104
+ "content": [
105
+ {"type": "image", "image": img_path},
106
+ {
107
+ "type": "text",
108
+ "text": f"The above image is the raw input image. The initial user input query is: '{initial_text_prompt}'."
109
+ + " "
110
+ + warning_text,
111
+ },
112
+ ],
113
+ }
114
+ assert len(part1[1]["content"]) == 2
115
+
116
+ # Build the new messages list: part1 (with optional warning), then part2
117
+ new_messages = list(part1)
118
+ new_messages.extend(part2)
119
+ return new_messages
120
+
121
+
122
def agent_inference(
    img_path: str,
    initial_text_prompt: str,
    debug: bool = False,
    send_generate_request=send_generate_request,
    call_sam_service=call_sam_service,
    max_generations: int = 100,
    output_dir="../../sam3_agent_out",
):
    """
    Given a text prompt and an image, this tool will perform all aspects of agentic problem solving,
    while saving sam3 and MLLM outputs to their respective directories.

    Args:
        img_path: Path to the input image
        initial_text_prompt: Initial text prompt from the user
        debug: Whether to enable debug mode
        send_generate_request: Callable used to query the MLLM (injectable for testing)
        call_sam_service: Callable used to run SAM 3 segmentation (injectable for testing)
        max_generations: Maximum number of send_generate_request calls allowed (default: 100)
        output_dir: Root directory for SAM outputs, error dumps and debug files

    Returns:
        Tuple of (messages history, final outputs dict, rendered output image).

    Raises:
        AssertionError: If the MLLM response violates the tool-call protocol.
        ValueError: On invalid tool-call JSON, an unknown tool name, an
            unexpected verdict, exceeding ``max_generations``, or a ``None``
            MLLM response.
    """
    # Set up output directories (SAM results, error dumps, debug files).
    sam_output_dir = os.path.join(output_dir, "sam_out")
    error_save_dir = os.path.join(output_dir, "none_out")
    debug_save_dir = os.path.join(output_dir, "agent_debug_out")
    os.makedirs(sam_output_dir, exist_ok=True)
    os.makedirs(error_save_dir, exist_ok=True)
    os.makedirs(debug_save_dir, exist_ok=True)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MLLM_SYSTEM_PROMPT_PATH = os.path.join(
        current_dir, "system_prompts/system_prompt.txt"
    )
    ITERATIVE_CHECKING_SYSTEM_PROMPT_PATH = os.path.join(
        current_dir, "system_prompts/system_prompt_iterative_checking.txt"
    )
    # Loop state.
    PATH_TO_LATEST_OUTPUT_JSON = ""
    LATEST_SAM3_TEXT_PROMPT = ""
    # Track all previously used text prompts for segment_phrase.
    USED_TEXT_PROMPTS = set()
    generation_count = 0  # Counter for number of send_generate_request calls

    # Debug setup: one folder per input image, holding the rolling history.
    debug_folder_path = None
    debug_jsonl_path = None
    if debug:
        debug_folder_path = os.path.join(
            debug_save_dir, f"{img_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}"
        )
        debug_jsonl_path = os.path.join(debug_folder_path, "debug_history.json")
        os.makedirs(debug_folder_path, exist_ok=True)

    # Load the system prompts for the main agent loop and the per-mask
    # iterative checking loop.
    with open(MLLM_SYSTEM_PROMPT_PATH, "r") as f:
        system_prompt = f.read().strip()
    with open(ITERATIVE_CHECKING_SYSTEM_PROMPT_PATH, "r") as f:
        iterative_checking_system_prompt = f.read().strip()

    # Construct the initial message list
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img_path},
                {
                    "type": "text",
                    "text": f"The above image is the raw input image. The initial user input query is: '{initial_text_prompt}'.",
                },
            ],
        },
    ]
    print(f"> Text prompt: {initial_text_prompt}")
    print(f"> Image path: {img_path}")

    print("\n\n")
    print("-" * 30 + f" Round {str(generation_count + 1)}" + "-" * 30)
    print("\n\n")
    generated_text = send_generate_request(messages)
    print(f"\n>>> MLLM Response [start]\n{generated_text}\n<<< MLLM Response [end]\n")
    while generated_text is not None:
        save_debug_messages(messages, debug, debug_folder_path, debug_jsonl_path)
        # BUGFIX: this was `assert (cond, msg)` — asserting a non-empty tuple,
        # which is always truthy and could never fail. Assert the condition.
        assert (
            "<tool>" in generated_text
        ), f"Generated text does not contain <tool> tag: {generated_text}"
        # Sometimes the MLLM doesn't know when to stop and emits multiple tool
        # calls in one round; keep only the first one.
        generated_text = generated_text.split("</tool>", 1)[0] + "</tool>"
        tool_call_json_str = (
            generated_text.split("<tool>")[-1]
            .split("</tool>")[0]
            .strip()
            .replace(r"}}}", r"}}")  # remove extra } if any
        )
        try:
            tool_call = json.loads(tool_call_json_str)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in tool call: {tool_call_json_str}") from e

        if PATH_TO_LATEST_OUTPUT_JSON == "":
            # The first tool call must be segment_phrase or report_no_mask
            assert (
                tool_call["name"] == "segment_phrase"
                or tool_call["name"] == "report_no_mask"
            )

        if tool_call["name"] == "segment_phrase":
            print("🔍 Calling segment_phrase tool...")
            assert list(tool_call["parameters"].keys()) == ["text_prompt"]

            # Check if this text_prompt has been used before
            current_text_prompt = tool_call["parameters"]["text_prompt"]
            if current_text_prompt in USED_TEXT_PROMPTS:
                print(
                    f"❌ Text prompt '{current_text_prompt}' has been used before. Requesting a different prompt."
                )
                duplicate_prompt_message = f"You have previously used '{current_text_prompt}' as your text_prompt to call the segment_phrase tool. You may not use it again. Please call the segment_phrase tool again with a different, perhaps more general, or more creative simple noun phrase prompt, while adhering to all the rules stated in the system prompt. You must also never use any of the following text_prompt(s): {str(list(USED_TEXT_PROMPTS))}."
                messages.append(
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": generated_text}],
                    }
                )
                messages.append(
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": duplicate_prompt_message}],
                    }
                )
            else:
                # Record the prompt and run SAM 3 segmentation with it.
                USED_TEXT_PROMPTS.add(current_text_prompt)
                LATEST_SAM3_TEXT_PROMPT = current_text_prompt
                PATH_TO_LATEST_OUTPUT_JSON = call_sam_service(
                    image_path=img_path,
                    text_prompt=current_text_prompt,
                    output_folder_path=sam_output_dir,
                )
                # Use a context manager so the file handle is not leaked.
                with open(PATH_TO_LATEST_OUTPUT_JSON, "r") as fin:
                    sam3_outputs = json.load(fin)
                sam3_output_image_path = sam3_outputs["output_image_path"]
                num_masks = len(sam3_outputs["pred_boxes"])

                messages.append(
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": generated_text}],
                    }
                )
                if num_masks == 0:
                    print("❌ No masks generated by SAM3, reporting no mask to Qwen.")
                    sam3_output_text_message = f"The segment_phrase tool did not generate any masks for the text_prompt '{current_text_prompt}'. Now, please call the segment_phrase tool again with a different, perhaps more general, or more creative simple noun phrase text_prompt, while adhering to all the rules stated in the system prompt. Please be reminded that the original user query was '{initial_text_prompt}'."
                    messages.append(
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": sam3_output_text_message}
                            ],
                        }
                    )
                else:
                    sam3_output_text_message = rf"The segment_phrase tool generated {num_masks} available masks. All {num_masks} available masks are rendered in this image below, now you must analyze the {num_masks} available mask(s) carefully, compare them against the raw input image and the original user query, and determine your next action. Please be reminded that the original user query was '{initial_text_prompt}'."
                    messages.append(
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": sam3_output_text_message},
                                {"type": "image", "image": sam3_output_image_path},
                            ],
                        }
                    )
                print("\n\n>>> sam3_output_text_message:\n", sam3_output_text_message)

        elif tool_call["name"] == "examine_each_mask":
            print("🔍 Calling examine_each_mask tool...")
            assert LATEST_SAM3_TEXT_PROMPT != ""

            # Make sure that the last message carries the rendered-masks image.
            assert (
                messages[-1]["content"][1]["type"] == "image"
            ), "Second content element should be an image"
            messages.pop()  # Remove the last user message
            # Add simplified replacement message (drops the image to keep the
            # running context small).
            simplified_message = {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "The segment_phrase tool generated several masks. Now you must analyze the mask(s) carefully, compare them against the raw input image and the original user query, and determine your next action.",
                    }
                ],
            }
            messages.append(simplified_message)

            with open(PATH_TO_LATEST_OUTPUT_JSON, "r") as fin:
                current_outputs = json.load(fin)
            num_masks = len(current_outputs["pred_masks"])
            masks_to_keep = []

            # MLLM checks the masks one by one against the original query.
            for i in range(num_masks):
                print(f"🔍 Checking mask {i+1}/{num_masks}...")
                image_w_mask_i, image_w_zoomed_in_mask_i = visualize(current_outputs, i)

                image_w_zoomed_in_mask_i_path = os.path.join(
                    sam_output_dir, rf"{LATEST_SAM3_TEXT_PROMPT}.png".replace("/", "_")
                ).replace(".png", f"_zoom_in_mask_{i + 1}.png")
                image_w_mask_i_path = os.path.join(
                    sam_output_dir, rf"{LATEST_SAM3_TEXT_PROMPT}.png".replace("/", "_")
                ).replace(".png", f"_selected_mask_{i + 1}.png")
                image_w_zoomed_in_mask_i.save(image_w_zoomed_in_mask_i_path)
                image_w_mask_i.save(image_w_mask_i_path)

                # Fresh, single-mask conversation for the checking model.
                iterative_checking_messages = [
                    {"role": "system", "content": iterative_checking_system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "The raw input image: "},
                            {"type": "image", "image": img_path},
                            {
                                "type": "text",
                                "text": f"The initial user input query is: '{initial_text_prompt}'",
                            },
                            {
                                "type": "text",
                                "text": "Image with the predicted segmentation mask rendered on it: ",
                            },
                            {
                                "type": "text",
                                "text": "Image with the zoomed-in mask: ",
                            },
                            {"type": "image", "image": image_w_zoomed_in_mask_i_path},
                        ],
                    },
                ]
                # Insert the rendered-mask image right after its caption.
                iterative_checking_messages[1]["content"].insert(
                    4, {"type": "image", "image": image_w_mask_i_path}
                )
                checking_generated_text = send_generate_request(
                    iterative_checking_messages
                )

                # Process the verdict to decide whether to keep the mask.
                if checking_generated_text is None:
                    raise ValueError(
                        "Generated text is None, which is unexpected. Please check the Qwen server and the input parameters."
                    )
                print(f"Generated text for mask {i+1}: {checking_generated_text}")
                verdict = (
                    checking_generated_text.split("<verdict>")[-1]
                    .split("</verdict>")[0]
                    .strip()
                )
                if "Accept" in verdict:
                    assert "Reject" not in verdict
                    print(f"Mask {i+1} accepted, keeping it in the outputs.")
                    masks_to_keep.append(i)
                elif "Reject" in verdict:
                    assert "Accept" not in verdict
                    print(f"Mask {i+1} rejected, removing it from the outputs.")
                else:
                    raise ValueError(
                        f"Unexpected verdict in generated text: {checking_generated_text}. Expected 'Accept' or 'Reject'."
                    )

            updated_outputs = {
                "original_image_path": current_outputs["original_image_path"],
                "orig_img_h": current_outputs["orig_img_h"],
                "orig_img_w": current_outputs["orig_img_w"],
                "pred_boxes": [current_outputs["pred_boxes"][i] for i in masks_to_keep],
                "pred_scores": [
                    current_outputs["pred_scores"][i] for i in masks_to_keep
                ],
                "pred_masks": [current_outputs["pred_masks"][i] for i in masks_to_keep],
            }

            image_w_check_masks = visualize(updated_outputs)
            image_w_check_masks_path = os.path.join(
                sam_output_dir, rf"{LATEST_SAM3_TEXT_PROMPT}.png"
            ).replace(
                ".png",
                f"_selected_masks_{'-'.join(map(str, [i+1 for i in masks_to_keep]))}.png".replace(
                    "/", "_"
                ),
            )
            image_w_check_masks.save(image_w_check_masks_path)
            # Save the updated json outputs and append to message history.
            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": generated_text}],
                }
            )
            if len(masks_to_keep) == 0:
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"The original user query was: '{initial_text_prompt}'. The examine_each_mask tool examined and rejected all of the masks generated by the segment_phrase tool. Now, please call the segment_phrase tool again with a different, perhaps more general, or more creative simple noun phrase text_prompt, while adhering to all the rules stated in the system prompt.",
                            }
                        ],
                    }
                )
            else:
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"The original user query was: '{initial_text_prompt}'. After calling the examine_each_mask tool on the available masks, the number of available masks is now {len(masks_to_keep)}. All {len(masks_to_keep)} available masks are rendered in this image below, now you must analyze the {len(masks_to_keep)} available mask(s) carefully, compare them against the raw input image and the original user query, and determine your next action.",
                            },
                            {"type": "image", "image": image_w_check_masks_path},
                        ],
                    }
                )

            # Create a new filename based on the original path to avoid filename length issues
            base_path = PATH_TO_LATEST_OUTPUT_JSON
            # Remove any existing "masks_" suffix to avoid duplication
            if "masks_" in base_path:
                base_path = base_path.split("masks_")[0] + ".json"
            # Create new filename with current masks; use a clearer suffix when empty
            if len(masks_to_keep) == 0:
                PATH_TO_LATEST_OUTPUT_JSON = base_path.replace(
                    ".json", "masks_none.json"
                )
            else:
                PATH_TO_LATEST_OUTPUT_JSON = base_path.replace(
                    ".json", f"masks_{'_'.join(map(str, masks_to_keep))}.json"
                )
            with open(PATH_TO_LATEST_OUTPUT_JSON, "w") as fout:
                json.dump(updated_outputs, fout, indent=4)

        elif tool_call["name"] == "select_masks_and_return":
            print("🔍 Calling select_masks_and_return tool...")
            with open(PATH_TO_LATEST_OUTPUT_JSON, "r") as fin:
                current_outputs = json.load(fin)

            assert list(tool_call["parameters"].keys()) == ["final_answer_masks"]
            masks_to_keep = tool_call["parameters"]["final_answer_masks"]

            # Keep only valid mask indices, remove duplicates, and preserve
            # deterministic ascending order (indices are 1-based here).
            available_masks = set(range(1, len(current_outputs["pred_masks"]) + 1))
            masks_to_keep = sorted({i for i in masks_to_keep if i in available_masks})
            # TODO: change this to an update message telling the model to try
            # again along with information about the errors made.

            final_outputs = {
                "original_image_path": current_outputs["original_image_path"],
                "orig_img_h": current_outputs["orig_img_h"],
                "orig_img_w": current_outputs["orig_img_w"],
                "pred_boxes": [
                    current_outputs["pred_boxes"][i - 1] for i in masks_to_keep
                ],
                "pred_scores": [
                    current_outputs["pred_scores"][i - 1] for i in masks_to_keep
                ],
                "pred_masks": [
                    current_outputs["pred_masks"][i - 1] for i in masks_to_keep
                ],
            }

            rendered_final_output = visualize(final_outputs)
            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": generated_text}],
                }
            )

            # Clean up debug files before successful return
            cleanup_debug_files(debug, debug_folder_path, debug_jsonl_path)
            return messages, final_outputs, rendered_final_output

        elif tool_call["name"] == "report_no_mask":
            print("🔍 Calling report_no_mask tool...")
            height, width = cv2.imread(img_path).shape[:2]
            final_outputs = {
                "original_image_path": img_path,
                "orig_img_h": height,
                "orig_img_w": width,
                "pred_boxes": [],
                "pred_scores": [],
                "pred_masks": [],
            }
            rendered_final_output = Image.open(img_path)
            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": generated_text}],
                }
            )
            # FIX: clean up debug files on this successful return too, for
            # consistency with the select_masks_and_return path.
            cleanup_debug_files(debug, debug_folder_path, debug_jsonl_path)
            return messages, final_outputs, rendered_final_output

        else:
            raise ValueError(f"Unknown tool call: {tool_call['name']}")

        # Truncate every assistant message to its first tool call (the MLLM
        # sometimes generates several in one round).
        for message in messages:
            if message["role"] == "assistant" and "content" in message:
                for content in message["content"]:
                    if (
                        isinstance(content, dict)
                        and content.get("type") == "text"
                        and "text" in content
                    ):
                        content["text"] = (
                            content["text"].split("</tool>", 1)[0] + "</tool>\n\n"
                        )
        # Prune the messages history before the next MLLM generation round according to the 3-part rules.
        # This keeps history compact and ensures the model sees only the allowed parts.
        messages = _prune_messages_for_next_round(
            messages,
            USED_TEXT_PROMPTS,
            LATEST_SAM3_TEXT_PROMPT,
            img_path,
            initial_text_prompt,
        )
        # make sure there can never be more than 2 images in the context
        assert count_images(messages) <= 2
        generation_count += 1
        if generation_count > max_generations:
            raise ValueError(
                f"Exceeded maximum number of allowed generation requests ({max_generations})"
            )

        print("\n\n")
        print("-" * 30 + f" Round {str(generation_count + 1)}" + "-" * 30)
        print("\n\n")
        generated_text = send_generate_request(messages)
        print(
            f"\n>>> MLLM Response [start]\n{generated_text}\n<<< MLLM Response [end]\n"
        )

    print("\n\n>>> SAM 3 Agent execution ended.\n\n")

    # A None response from the MLLM is unrecoverable: dump the conversation
    # history for post-mortem debugging, then raise.
    error_save_path = os.path.join(
        error_save_dir,
        f"{img_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}_error_history.json",
    )
    with open(error_save_path, "w") as f:
        json.dump(messages, f, indent=4)
    print("Saved messages history that caused error to:", error_save_path)
    raise ValueError(
        rf"Generated text is None, which is unexpected. Please check the Qwen server and the input parameters for image path: {img_path} and initial text prompt: {initial_text_prompt}."
    )
sam3/agent/client_llm.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import base64
4
+ import os
5
+ from typing import Any, Optional
6
+
7
+ from openai import OpenAI
8
+
9
+
10
def get_image_base64_and_mime(image_path):
    """Convert image file to base64 string and get MIME type"""
    # Known extensions; anything unknown falls back to JPEG.
    mime_by_extension = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }
    try:
        extension = os.path.splitext(image_path)[1].lower()
        mime_type = mime_by_extension.get(extension, "image/jpeg")

        # Read the raw bytes and base64-encode them.
        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("utf-8")
        return encoded, mime_type
    except Exception as e:
        # Any failure (missing file, permissions, ...) yields (None, None).
        print(f"Error converting image to base64: {e}")
        return None, None
32
+
33
+
34
def _encode_image_messages(messages):
    """Return a copy of *messages* where user-message image entries
    ({"type": "image", "image": <path>}) are replaced by base64-encoded
    OpenAI ``image_url`` entries. Unreadable images are dropped with a
    warning so one bad file does not abort the whole request."""
    processed_messages = []
    for message in messages:
        processed_message = message.copy()
        if message["role"] == "user" and "content" in message:
            processed_content = []
            for c in message["content"]:
                if isinstance(c, dict) and c.get("type") == "image":
                    image_path = c["image"]
                    print("image_path", image_path)
                    # Escape ? in the path
                    new_image_path = image_path.replace("?", "%3F")
                    try:
                        base64_image, mime_type = get_image_base64_and_mime(
                            new_image_path
                        )
                        if base64_image is None:
                            print(
                                f"Warning: Could not convert image to base64: {new_image_path}"
                            )
                            continue
                        # Proper image_url structure with inline base64 data.
                        processed_content.append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}",
                                    "detail": "high",
                                },
                            }
                        )
                    except FileNotFoundError:
                        print(f"Warning: Image file not found: {new_image_path}")
                        continue
                    except Exception as e:
                        print(f"Warning: Error processing image {new_image_path}: {e}")
                        continue
                else:
                    processed_content.append(c)
            processed_message["content"] = processed_content
        processed_messages.append(processed_message)
    return processed_messages


def send_generate_request(
    messages,
    server_url=None,
    model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    api_key=None,
    max_tokens=4096,
):
    """
    Sends a request to an OpenAI-compatible API endpoint using the OpenAI client library.

    Args:
        messages (list): A list of message dicts, each containing role and content.
            Local image entries are converted to base64 image_url entries first.
        server_url (str): The base URL of the server, e.g. "http://127.0.0.1:8000";
            None uses the OpenAI client default.
        model (str): The model to use for generation
            (default: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8").
        api_key (str): API key for the endpoint; None uses the client default.
        max_tokens (int): Maximum number of tokens to generate (default: 4096).

    Returns:
        str or None: The generated response text, or None if the request failed.
    """
    # Convert any local image paths to inline base64 payloads.
    processed_messages = _encode_image_messages(messages)

    # Create OpenAI client with custom base URL
    client = OpenAI(api_key=api_key, base_url=server_url)

    try:
        print(f"🔍 Calling model {model}...")
        response = client.chat.completions.create(
            model=model,
            messages=processed_messages,
            max_completion_tokens=max_tokens,
            n=1,
        )

        # Extract the response content
        if response.choices and len(response.choices) > 0:
            return response.choices[0].message.content
        else:
            print(f"Unexpected response format: {response}")
            return None

    except Exception as e:
        # Network / API failures are reported to the caller as None.
        print(f"Request failed: {e}")
        return None
126
+
127
+
128
def send_direct_request(
    llm: Any,
    messages: list[dict[str, Any]],
    sampling_params: Any,
) -> Optional[str]:
    """
    Run inference on a vLLM model instance directly without using a server.

    Args:
        llm: Initialized vLLM LLM instance (passed from external initialization)
        messages: List of message dicts with role and content (OpenAI format)
        sampling_params: vLLM SamplingParams instance (initialized externally)

    Returns:
        str: Generated response text, or None if inference fails
    """
    try:
        # Rewrite user messages so local image paths become inline base64
        # image_url payloads understood by vLLM.
        prepared = []
        for msg in messages:
            out_msg = msg.copy()
            if msg["role"] == "user" and "content" in msg:
                new_content = []
                for item in msg["content"]:
                    if not (isinstance(item, dict) and item.get("type") == "image"):
                        new_content.append(item)
                        continue
                    # Convert image path to base64 format (escape '?').
                    new_image_path = item["image"].replace("?", "%3F")
                    try:
                        base64_image, mime_type = get_image_base64_and_mime(
                            new_image_path
                        )
                    except Exception as e:
                        print(
                            f"Warning: Error processing image {new_image_path}: {e}"
                        )
                        continue
                    if base64_image is None:
                        print(
                            f"Warning: Could not convert image: {new_image_path}"
                        )
                        continue
                    # vLLM expects image_url format
                    new_content.append(
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            },
                        }
                    )
                out_msg["content"] = new_content
            prepared.append(out_msg)

        print("🔍 Running direct inference with vLLM...")

        # Run inference using vLLM's chat interface.
        outputs = llm.chat(
            messages=prepared,
            sampling_params=sampling_params,
        )

        # First output's first candidate carries the generated text.
        if not outputs:
            print(f"Unexpected output format: {outputs}")
            return None
        return outputs[0].outputs[0].text

    except Exception as e:
        print(f"Direct inference failed: {e}")
        return None
sam3/agent/client_sam3.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import json
4
+ import os
5
+
6
+ import torch
7
+ from PIL import Image
8
+
9
+ from sam3.model.box_ops import box_xyxy_to_xywh
10
+ from sam3.train.masks_ops import rle_encode
11
+
12
+ from .helpers.mask_overlap_removal import remove_overlapping_masks
13
+ from .viz import visualize
14
+
15
+
16
def sam3_inference(processor, image_path, text_prompt):
    """Run SAM 3 image inference for a text prompt and assemble a JSON-friendly result.

    Returns a dict with the original image size, xywh boxes normalized to
    [0, 1], RLE-encoded masks (``counts`` strings only) and per-prediction
    scores.
    """
    img = Image.open(image_path)
    width, height = img.size

    # Prompted inference: encode the image once, then apply the text prompt.
    state = processor.set_image(img)
    state = processor.set_text_prompt(state=state, prompt=text_prompt)

    # Normalize absolute xyxy corners into [0, 1] image coordinates.
    bx = state["boxes"]
    boxes_xyxy = torch.stack(
        (
            bx[:, 0] / width,
            bx[:, 1] / height,
            bx[:, 2] / width,
            bx[:, 3] / height,
        ),
        dim=-1,
    )
    boxes_xywh = box_xyxy_to_xywh(boxes_xyxy).tolist()

    # Drop the singleton channel dim before RLE-encoding the masks; only the
    # run-length "counts" payload is kept for serialization.
    rles = rle_encode(state["masks"].squeeze(1))

    return {
        "orig_img_h": height,
        "orig_img_w": width,
        "pred_boxes": boxes_xywh,
        "pred_masks": [rle["counts"] for rle in rles],
        "pred_scores": state["scores"].tolist(),
    }
48
+
49
+
50
def call_sam_service(
    sam3_processor,
    image_path: str,
    text_prompt: str,
    output_folder_path: str = "sam3_output",
):
    """
    Run SAM 3 on one image/prompt pair, post-process the detections,
    persist them as JSON and render a visualization image.

    Returns the JSON output path (even when processing fails, so callers
    can probe for the file's existence).
    """
    print(f"📞 Loading image '{image_path}' and sending with prompt '{text_prompt}'...")

    # Slashes in the prompt/image path would create sub-directories; flatten them.
    safe_prompt = text_prompt.replace("/", "_")
    run_dir = os.path.join(output_folder_path, image_path.replace("/", "-"))
    os.makedirs(run_dir, exist_ok=True)
    output_json_path = os.path.join(run_dir, f"{safe_prompt}.json")
    output_image_path = os.path.join(run_dir, f"{safe_prompt}.png")

    try:
        # Run local inference on the image and text prompt.
        results = sam3_inference(sam3_processor, image_path, text_prompt)

        # 1. Suppress heavily-overlapping masks and record the I/O paths.
        results = remove_overlapping_masks(results)
        results = {
            "original_image_path": image_path,
            "output_image_path": output_image_path,
            **results,
        }

        # 2. Reorder predictions by confidence, highest first (when present).
        scores = results.get("pred_scores")
        if scores:
            order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
            for key in ("pred_scores", "pred_boxes", "pred_masks"):
                results[key] = [results[key][i] for i in order]

        # 3. Discard degenerate RLE masks (counts strings of < 5 characters),
        #    keeping boxes/scores aligned with the surviving masks.
        keep = [i for i, rle in enumerate(results["pred_masks"]) if len(rle) > 4]
        results["pred_masks"] = [results["pred_masks"][i] for i in keep]
        results["pred_boxes"] = [results["pred_boxes"][i] for i in keep]
        results["pred_scores"] = [results["pred_scores"][i] for i in keep]

        with open(output_json_path, "w") as f:
            json.dump(results, f, indent=4)
        print(f"✅ Raw JSON response saved to '{output_json_path}'")

        # 4. Render the detections on the image and save next to the JSON.
        print("🔍 Rendering visualizations on the image ...")
        overlay = visualize(results)
        os.makedirs(os.path.dirname(output_image_path), exist_ok=True)
        overlay.save(output_image_path)
        print("✅ Saved visualization at:", output_image_path)
    except Exception as e:
        # Best-effort by design: report and still return the expected path.
        print(f"❌ Error calling service: {e}")

    return output_json_path
sam3/agent/helpers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
sam3/agent/helpers/boxes.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import math
4
+ from enum import IntEnum, unique
5
+ from typing import List, Tuple, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch import device
10
+
11
_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray]


@unique
class BoxMode(IntEnum):
    """
    Enum of the supported box encodings.
    """

    XYXY_ABS = 0
    """
    (x0, y0, x1, y1) in absolute floating points coordinates.
    The coordinates in range [0, width or height].
    """
    XYWH_ABS = 1
    """
    (x0, y0, w, h) in absolute floating points coordinates.
    """
    XYXY_REL = 2
    """
    Not yet supported!
    (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image.
    """
    XYWH_REL = 3
    """
    Not yet supported!
    (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image.
    """
    XYWHA_ABS = 4
    """
    (xc, yc, w, h, a) in absolute floating points coordinates.
    (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw.
    """

    @staticmethod
    def convert(
        box: "_RawBoxType", from_mode: "BoxMode", to_mode: "BoxMode"
    ) -> "_RawBoxType":
        """
        Convert ``box`` between encodings.

        Args:
            box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5
            from_mode, to_mode (BoxMode)

        Returns:
            The converted box of the same type.
        """
        if from_mode == to_mode:
            return box

        box_type = type(box)
        from_numpy = isinstance(box, np.ndarray)
        from_sequence = isinstance(box, (list, tuple))
        if from_sequence:
            assert len(box) == 4 or len(box) == 5, (
                "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor,"
                " where k == 4 or 5"
            )
            arr = torch.tensor(box)[None, :]
        elif from_numpy:
            # Copy so the caller's array is never mutated in place.
            arr = torch.from_numpy(np.asarray(box)).clone()
        else:
            arr = box.clone()

        unsupported = (BoxMode.XYXY_REL, BoxMode.XYWH_REL)
        assert (
            to_mode not in unsupported and from_mode not in unsupported
        ), "Relative mode not yet supported!"

        if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS:
            assert (
                arr.shape[-1] == 5
            ), "The last dimension of input shape must be 5 for XYWHA format"
            in_dtype = arr.dtype
            arr = arr.double()

            width = arr[:, 2]
            height = arr[:, 3]
            angle = arr[:, 4]
            cos_a = torch.abs(torch.cos(angle * math.pi / 180.0))
            sin_a = torch.abs(torch.sin(angle * math.pi / 180.0))
            # Axis-aligned extent of the horizontal bounding rectangle that
            # encloses the rotated box.
            new_w = cos_a * width + sin_a * height
            new_h = cos_a * height + sin_a * width

            # Center -> top-left corner, then derive the bottom-right corner.
            arr[:, 0] -= new_w / 2.0
            arr[:, 1] -= new_h / 2.0
            arr[:, 2] = arr[:, 0] + new_w
            arr[:, 3] = arr[:, 1] + new_h

            arr = arr[:, :4].to(dtype=in_dtype)
        elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS:
            in_dtype = arr.dtype
            arr = arr.double()
            # Top-left -> center, then append a zero rotation angle.
            arr[:, 0] += arr[:, 2] / 2.0
            arr[:, 1] += arr[:, 3] / 2.0
            zero_angle = torch.zeros((arr.shape[0], 1), dtype=arr.dtype)
            arr = torch.cat((arr, zero_angle), dim=1).to(dtype=in_dtype)
        elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYXY_ABS:
            arr[:, 2] += arr[:, 0]
            arr[:, 3] += arr[:, 1]
        elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS:
            arr[:, 2] -= arr[:, 0]
            arr[:, 3] -= arr[:, 1]
        else:
            raise NotImplementedError(
                "Conversion from BoxMode {} to {} is not supported yet".format(
                    from_mode, to_mode
                )
            )

        if from_sequence:
            return box_type(arr.flatten().tolist())
        return arr.numpy() if from_numpy else arr
135
+
136
+
137
class Boxes:
    """
    A list of N axis-aligned boxes stored as an Nx4 float32 tensor, one
    (x1, y1, x2, y2) row per box. Supports common box operations
    (`area`, `clip`, `nonempty`, ...) and behaves like a tensor for
    indexing, iteration, `.to(device)` and `.device`.

    Attributes:
        tensor (torch.Tensor): float matrix of Nx4. Each row is (x1, y1, x2, y2).
    """

    def __init__(self, tensor: torch.Tensor):
        """
        Args:
            tensor (Tensor[float]): a Nx4 matrix. Each row is (x1, y1, x2, y2).
        """
        if isinstance(tensor, torch.Tensor):
            tensor = tensor.to(torch.float32)
        else:
            tensor = torch.as_tensor(
                tensor, dtype=torch.float32, device=torch.device("cpu")
            )
        if tensor.numel() == 0:
            # Reshape instead of allocating fresh storage so jit/tracing still
            # sees a tensor that depends on the input.
            tensor = tensor.reshape((-1, 4)).to(dtype=torch.float32)
        assert tensor.dim() == 2 and tensor.size(-1) == 4, tensor.size()

        self.tensor = tensor

    def clone(self) -> "Boxes":
        """
        Return a Boxes backed by a copy of the underlying tensor.

        Returns:
            Boxes
        """
        return Boxes(self.tensor.clone())

    def to(self, device: torch.device):
        # Boxes are always float32; only the device may change via to().
        return Boxes(self.tensor.to(device=device))

    def area(self) -> torch.Tensor:
        """
        Compute the area of every box.

        Returns:
            torch.Tensor: a vector with areas of each box.
        """
        t = self.tensor
        return (t[:, 2] - t[:, 0]) * (t[:, 3] - t[:, 1])

    def clip(self, box_size: Tuple[int, int]) -> None:
        """
        Clamp (in place) x coordinates to [0, width] and y coordinates to
        [0, height].

        Args:
            box_size (height, width): The clipping box's size.
        """
        assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!"
        h, w = box_size
        cols = (
            self.tensor[:, 0].clamp(min=0, max=w),
            self.tensor[:, 1].clamp(min=0, max=h),
            self.tensor[:, 2].clamp(min=0, max=w),
            self.tensor[:, 3].clamp(min=0, max=h),
        )
        self.tensor = torch.stack(cols, dim=-1)

    def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
        """
        Return a boolean vector that is True exactly for boxes whose width
        and height are both larger than `threshold`.
        """
        t = self.tensor
        return ((t[:, 2] - t[:, 0]) > threshold) & ((t[:, 3] - t[:, 1]) > threshold)

    def __getitem__(self, item) -> "Boxes":
        """
        Index with an int, a slice, or a BoolTensor of length len(boxes);
        always returns a Boxes (an int yields a single-row Boxes).

        Note that the returned Boxes might share storage with this Boxes,
        subject to Pytorch's indexing semantics.
        """
        if isinstance(item, int):
            return Boxes(self.tensor[item].view(1, -1))
        selected = self.tensor[item]
        assert (
            selected.dim() == 2
        ), "Indexing on Boxes with {} failed to return a matrix!".format(item)
        return Boxes(selected)

    def __len__(self) -> int:
        return self.tensor.shape[0]

    def __repr__(self) -> str:
        return "Boxes(" + str(self.tensor) + ")"

    def inside_box(
        self, box_size: Tuple[int, int], boundary_threshold: int = 0
    ) -> torch.Tensor:
        """
        Args:
            box_size (height, width): Size of the reference box.
            boundary_threshold (int): Boxes that extend beyond the reference box
                boundary by more than boundary_threshold are considered "outside".

        Returns:
            a binary vector, indicating whether each box is inside the reference box.
        """
        height, width = box_size
        t = self.tensor
        return (
            (t[..., 0] >= -boundary_threshold)
            & (t[..., 1] >= -boundary_threshold)
            & (t[..., 2] < width + boundary_threshold)
            & (t[..., 3] < height + boundary_threshold)
        )

    def get_centers(self) -> torch.Tensor:
        """Return the box centers as an Nx2 array of (x, y)."""
        return (self.tensor[:, :2] + self.tensor[:, 2:]) / 2

    def scale(self, scale_x: float, scale_y: float) -> None:
        """Scale all boxes in place by per-axis factors."""
        self.tensor[:, 0::2] *= scale_x
        self.tensor[:, 1::2] *= scale_y

    @classmethod
    def cat(cls, boxes_list: List["Boxes"]) -> "Boxes":
        """
        Concatenate a list of Boxes into a single Boxes.

        Arguments:
            boxes_list (list[Boxes])

        Returns:
            Boxes: the concatenated Boxes
        """
        assert isinstance(boxes_list, (list, tuple))
        if not boxes_list:
            return cls(torch.empty(0))
        assert all([isinstance(box, Boxes) for box in boxes_list])

        # torch.cat guarantees the result never shares storage with the inputs.
        return cls(torch.cat([b.tensor for b in boxes_list], dim=0))

    @property
    def device(self) -> device:
        return self.tensor.device

    # yield / iter() are not supported by torchscript; keep this off the
    # scripted path: https://github.com/pytorch/pytorch/issues/18627
    @torch.jit.unused
    def __iter__(self):
        """Yield each box as a Tensor of shape (4,)."""
        yield from self.tensor
323
+
324
+
325
def pairwise_intersection(boxes1: "Boxes", boxes2: "Boxes") -> torch.Tensor:
    """
    Compute the intersection area between __all__ N x M pairs of boxes from
    two collections. The box order must be (xmin, ymin, xmax, ymax).

    Args:
        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.

    Returns:
        Tensor: intersection, sized [N,M].
    """
    t1, t2 = boxes1.tensor, boxes2.tensor
    # Per-axis overlap: innermost max-corner minus outermost min-corner;
    # negative extents mean "no overlap" and are clamped to zero.
    hi = torch.min(t1[:, None, 2:], t2[:, 2:])
    lo = torch.max(t1[:, None, :2], t2[:, :2])
    extent = (hi - lo).clamp_(min=0)  # [N,M,2]
    return extent.prod(dim=2)  # [N,M]
345
+
346
+
347
+ # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
348
+ # with slight modifications
349
def pairwise_iou(boxes1: "Boxes", boxes2: "Boxes") -> torch.Tensor:
    """
    Compute the IoU (intersection over union) between **all** N x M pairs
    of boxes. The box order must be (xmin, ymin, xmax, ymax).

    Args:
        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.

    Returns:
        Tensor: IoU, sized [N,M].
    """
    area1 = boxes1.area()  # [N]
    area2 = boxes2.area()  # [M]
    inter = pairwise_intersection(boxes1, boxes2)

    # Non-overlapping (including empty-box) pairs get IoU 0; the masked
    # division keeps degenerate unions from producing NaN in the output.
    union = area1[:, None] + area2 - inter
    zero = torch.zeros(1, dtype=inter.dtype, device=inter.device)
    return torch.where(inter > 0, inter / union, zero)
372
+
373
+
374
def pairwise_ioa(boxes1: "Boxes", boxes2: "Boxes") -> torch.Tensor:
    """
    Similar to :func:`pairwise_iou` but computes the IoA (intersection over
    the area of the *second* box).

    Args:
        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.

    Returns:
        Tensor: IoA, sized [N,M].
    """
    area2 = boxes2.area()  # [M]
    inter = pairwise_intersection(boxes1, boxes2)

    # Empty intersections map straight to 0 instead of dividing by a
    # possibly-zero area.
    zero = torch.zeros(1, dtype=inter.dtype, device=inter.device)
    return torch.where(inter > 0, inter / area2, zero)
392
+
393
+
394
def pairwise_point_box_distance(points: torch.Tensor, boxes: "Boxes"):
    """
    Pairwise distances from N points to the four edges of M boxes. All
    four distances are positive exactly when the point lies inside the box.

    Args:
        points: Nx2 coordinates. Each row is (x, y)
        boxes: M boxes

    Returns:
        Tensor: distances of size (N, M, 4). The 4 values are distances from
        the point to the left, top, right, bottom of the box.
    """
    px, py = points.unsqueeze(dim=2).unbind(dim=1)  # (N, 1)
    left, top, right, bottom = boxes.tensor.unsqueeze(dim=0).unbind(dim=2)  # (1, M)
    return torch.stack([px - left, py - top, right - px, bottom - py], dim=2)
411
+
412
+
413
def matched_pairwise_iou(boxes1: "Boxes", boxes2: "Boxes") -> torch.Tensor:
    """
    IoU between boxes matched by position: element i of the result is the
    IoU of boxes1[i] with boxes2[i], i.e. only the diagonal of the matrix
    :func:`pairwise_iou` would produce.

    Args:
        boxes1 (Boxes): bounding boxes, sized [N,4].
        boxes2 (Boxes): same length as boxes1
    Returns:
        Tensor: iou, sized [N].
    """
    assert len(boxes1) == len(boxes2), (
        "boxlists should have the same" "number of entries, got {}, {}".format(
            len(boxes1), len(boxes2)
        )
    )
    area1 = boxes1.area()  # [N]
    area2 = boxes2.area()  # [N]
    t1, t2 = boxes1.tensor, boxes2.tensor
    # Row-wise overlap rectangle; clamp removes non-overlapping pairs.
    overlap_lt = torch.max(t1[:, :2], t2[:, :2])  # [N,2]
    overlap_rb = torch.min(t1[:, 2:], t2[:, 2:])  # [N,2]
    extent = (overlap_rb - overlap_lt).clamp(min=0)  # [N,2]
    inter = extent[:, 0] * extent[:, 1]  # [N]
    return inter / (area1 + area2 - inter)  # [N]
sam3/agent/helpers/color_map.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """
4
+ An awesome colormap for really neat visualizations.
5
+ Copied from Detectron, and removed gray colors.
6
+ """
7
+
8
+ import random
9
+
10
+ import numpy as np
11
+
12
+ __all__ = ["colormap", "random_color", "random_colors"]
13
+
14
+
15
# A list of 20 bright and sharp colors for segmentation masks,
# generated from the edges of the sRGB color space for maximum intensity.
# Stored flat as N*3 floats in [0, 1] and reshaped below to (N, 3) RGB rows.
_COLORS = (
    np.array(
        [
            # The original 8 sharp colors
            1.000, 1.000, 0.000,  # 1. Yellow
            0.000, 1.000, 0.000,  # 2. Lime
            0.000, 1.000, 1.000,  # 3. Cyan
            1.000, 0.000, 1.000,  # 4. Magenta
            1.000, 0.000, 0.000,  # 5. Red
            1.000, 0.498, 0.000,  # 6. Orange
            0.498, 1.000, 0.000,  # 7. Chartreuse
            0.000, 1.000, 0.498,  # 8. Spring Green
            1.000, 0.000, 0.498,  # 9. Rose
            0.498, 0.000, 1.000,  # 10. Violet
            0.753, 1.000, 0.000,  # 11. Electric Lime
            1.000, 0.753, 0.000,  # 12. Vivid Orange
            0.000, 1.000, 0.753,  # 13. Turquoise
            0.753, 0.000, 1.000,  # 14. Bright Violet
            1.000, 0.000, 0.753,  # 15. Bright Pink
            1.000, 0.251, 0.000,  # 16. Fiery Orange
            0.251, 1.000, 0.000,  # 17. Bright Chartreuse
            0.000, 1.000, 0.251,  # 18. Malachite Green
            0.251, 0.000, 1.000,  # 19. Deep Violet
            1.000, 0.000, 0.251,  # 20. Hot Pink
        ]
    )
    .astype(np.float32)
    .reshape(-1, 3)
)
86
+
87
+
88
def colormap(rgb=False, maximum=255):
    """
    Return the whole palette as an Nx3 array.

    Args:
        rgb (bool): whether to return RGB colors or BGR colors.
        maximum (int): either 255 or 1

    Returns:
        ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1]
    """
    assert maximum in [255, 1], maximum
    palette = _COLORS * maximum
    # Channel order flip converts the RGB palette to OpenCV-style BGR.
    return palette if rgb else palette[:, ::-1]
102
+
103
+
104
def random_color(rgb=False, maximum=255):
    """
    Pick one palette color at random.

    Args:
        rgb (bool): whether to return RGB colors or BGR colors.
        maximum (int): either 255 or 1

    Returns:
        ndarray: a vector of 3 numbers
    """
    choice = _COLORS[np.random.randint(0, len(_COLORS))] * maximum
    # Reverse the channel order for BGR output.
    return choice if rgb else choice[::-1]
118
+
119
+
120
def random_colors(N, rgb=False, maximum=255):
    """
    Draw N palette colors at random.

    Colors are unique while N does not exceed the palette size; beyond that
    the palette is reused (sampling with replacement) instead of letting
    `random.sample` raise ValueError as the previous implementation did.

    Args:
        N (int): number of colors needed
        rgb (bool): whether to return RGB colors or BGR colors.
        maximum (int): either 255 or 1

    Returns:
        list[ndarray]: N color vectors of 3 numbers each
    """
    if N <= len(_COLORS):
        # Unchanged fast path: unique colors, same distribution as before.
        indices = random.sample(range(len(_COLORS)), N)
    else:
        # More colors requested than the palette holds: allow repeats.
        indices = random.choices(range(len(_COLORS)), k=N)
    ret = [_COLORS[i] * maximum for i in indices]
    if not rgb:
        ret = [x[::-1] for x in ret]
    return ret
135
+
136
+
137
if __name__ == "__main__":
    import cv2

    # Visual sanity check for the palette: paint each color into a
    # 100x100-pixel cell of a 10x10 grid over random noise and display it.
    size = 100
    H, W = 10, 10
    canvas = np.random.rand(H * size, W * size, 3).astype("float32")
    for h in range(H):
        for w in range(W):
            idx = h * W + w
            if idx >= len(_COLORS):
                # Fewer palette entries than grid cells; leave the rest as noise.
                break
            canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx]
    cv2.imshow("a", canvas)
    cv2.waitKey(0)
sam3/agent/helpers/keypoints.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from typing import Any, List, Tuple, Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.nn import functional as F
8
+
9
+
10
class Keypoints:
    """
    Container for keypoint **annotation** data, stored as an (N, K, 3)
    float32 tensor of (x, y, visibility) triplets — N instances with K
    keypoints each. The visibility flag follows the COCO format and must
    be one of three integers:

    * v=0: not labeled (in which case x=y=0)
    * v=1: labeled but not visible
    * v=2: labeled and visible
    """

    def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]):
        """
        Arguments:
            keypoints: A Tensor, numpy array, or list of the x, y, and
                visibility of each keypoint, shaped (N, K, 3).
        """
        if isinstance(keypoints, torch.Tensor):
            target_device = keypoints.device
        else:
            target_device = torch.device("cpu")
        data = torch.as_tensor(keypoints, dtype=torch.float32, device=target_device)
        assert data.dim() == 3 and data.shape[2] == 3, data.shape
        self.tensor = data

    def __len__(self) -> int:
        return self.tensor.size(0)

    def to(self, *args: Any, **kwargs: Any) -> "Keypoints":
        return type(self)(self.tensor.to(*args, **kwargs))

    @property
    def device(self) -> torch.device:
        return self.tensor.device

    def to_heatmap(self, boxes: torch.Tensor, heatmap_size: int) -> torch.Tensor:
        """
        Encode the keypoints as one-hot heatmap labels for training, as
        described in :paper:`Mask R-CNN`.

        Arguments:
            boxes: Nx4 tensor, the boxes to draw the keypoints to

        Returns:
            heatmaps:
                A tensor of shape (N, K), each element is integer spatial label
                in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
            valid:
                A tensor of shape (N, K) containing whether each keypoint is in the roi or not.
        """
        return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size)

    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints":
        """
        Index with an int, a slice, or a boolean mask of length len(kpts);
        always returns a Keypoints (which might share storage with this
        one, subject to PyTorch's indexing semantics).
        """
        if isinstance(item, int):
            return Keypoints([self.tensor[item]])
        return Keypoints(self.tensor[item])

    def __repr__(self) -> str:
        return "{}(num_instances={})".format(self.__class__.__name__, len(self.tensor))

    @staticmethod
    def cat(keypoints_list: List["Keypoints"]) -> "Keypoints":
        """
        Concatenate a list of Keypoints into a single Keypoints.

        Arguments:
            keypoints_list (list[Keypoints])

        Returns:
            Keypoints: the concatenated Keypoints
        """
        assert isinstance(keypoints_list, (list, tuple))
        assert len(keypoints_list) > 0
        assert all(isinstance(kpts, Keypoints) for kpts in keypoints_list)

        merged = torch.cat([kpts.tensor for kpts in keypoints_list], dim=0)
        return type(keypoints_list[0])(merged)
108
+
109
+
110
+ def _keypoints_to_heatmap(
111
+ keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int
112
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
113
+ """
114
+ Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space.
115
+
116
+ Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the
117
+ closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the
118
+ continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"):
119
+ d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
120
+
121
+ Arguments:
122
+ keypoints: tensor of keypoint locations in of shape (N, K, 3).
123
+ rois: Nx4 tensor of rois in xyxy format
124
+ heatmap_size: integer side length of square heatmap.
125
+
126
+ Returns:
127
+ heatmaps: A tensor of shape (N, K) containing an integer spatial label
128
+ in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
129
+ valid: A tensor of shape (N, K) containing whether each keypoint is in
130
+ the roi or not.
131
+ """
132
+
133
+ if rois.numel() == 0:
134
+ return rois.new().long(), rois.new().long()
135
+ offset_x = rois[:, 0]
136
+ offset_y = rois[:, 1]
137
+ scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
138
+ scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])
139
+
140
+ offset_x = offset_x[:, None]
141
+ offset_y = offset_y[:, None]
142
+ scale_x = scale_x[:, None]
143
+ scale_y = scale_y[:, None]
144
+
145
+ x = keypoints[..., 0]
146
+ y = keypoints[..., 1]
147
+
148
+ x_boundary_inds = x == rois[:, 2][:, None]
149
+ y_boundary_inds = y == rois[:, 3][:, None]
150
+
151
+ x = (x - offset_x) * scale_x
152
+ x = x.floor().long()
153
+ y = (y - offset_y) * scale_y
154
+ y = y.floor().long()
155
+
156
+ x[x_boundary_inds] = heatmap_size - 1
157
+ y[y_boundary_inds] = heatmap_size - 1
158
+
159
+ valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
160
+ vis = keypoints[..., 2] > 0
161
+ valid = (valid_loc & vis).long()
162
+
163
+ lin_ind = y * heatmap_size + x
164
+ heatmaps = lin_ind * valid
165
+
166
+ return heatmaps, valid
167
+
168
+
169
@torch.jit.script_if_tracing
def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
    """
    Extract predicted keypoint locations from heatmaps.

    Args:
        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for
            each ROI and each keypoint.
        rois (Tensor): (#ROIs, 4). The box of each ROI.

    Returns:
        Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to
        (x, y, logit, score) for each keypoint.

    When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate,
    we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from
    Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
    """

    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    # Clamp to at least one pixel so degenerate rois still give a valid resize target.
    widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
    heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_rois, num_keypoints = maps.shape[:2]
    xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)

    # Ratio between the fractional roi extent and the integer upsample size;
    # used to map bin centers back to continuous image coordinates.
    width_corrections = widths / widths_ceil
    height_corrections = heights / heights_ceil

    keypoints_idx = torch.arange(num_keypoints, device=maps.device)

    for i in range(num_rois):
        # Upsample this roi's heatmap to (approximately) its pixel size.
        outsize = (int(heights_ceil[i]), int(widths_ceil[i]))
        roi_map = F.interpolate(
            maps[[i]], size=outsize, mode="bicubic", align_corners=False
        )

        # Although semantically equivalent, `reshape` is used instead of `squeeze` due
        # to limitation during ONNX export of `squeeze` in scripting mode
        roi_map = roi_map.reshape(roi_map.shape[1:])  # keypoints x H x W

        # softmax over the spatial region
        max_score, _ = roi_map.view(num_keypoints, -1).max(1)
        max_score = max_score.view(num_keypoints, 1, 1)
        tmp_full_resolution = (roi_map - max_score).exp_()
        tmp_pool_resolution = (maps[i] - max_score).exp_()
        # Produce scores over the region H x W, but normalize with POOL_H x POOL_W,
        # so that the scores of objects of different absolute sizes will be more comparable
        roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum(
            (1, 2), keepdim=True
        )

        w = roi_map.shape[2]
        pos = roi_map.view(num_keypoints, -1).argmax(1)

        # Convert the flat argmax back to 2D (x, y) bin indices.
        x_int = pos % w
        y_int = (pos - x_int) // w

        # The per-keypoint normalization above is monotonic, so the argmax of
        # the raw map and of the normalized scores must coincide.
        assert (
            roi_map_scores[keypoints_idx, y_int, x_int]
            == roi_map_scores.view(num_keypoints, -1).max(1)[0]
        ).all()

        # Bin center (d + 0.5), rescaled to image space and offset by the roi origin.
        x = (x_int.float() + 0.5) * width_corrections[i]
        y = (y_int.float() + 0.5) * height_corrections[i]

        xy_preds[i, :, 0] = x + offset_x[i]
        xy_preds[i, :, 1] = y + offset_y[i]
        xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int]
        xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int]

    return xy_preds
sam3/agent/helpers/mask_overlap_removal.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ try:
9
+ from pycocotools import mask as mask_utils
10
+ except Exception:
11
+ mask_utils = None
12
+
13
+
14
def mask_intersection(
    masks1: torch.Tensor, masks2: torch.Tensor, block_size: int = 16
) -> torch.Tensor:
    """Pairwise intersection areas (in pixels) between two sets of bool masks.

    The (N, M, H, W) broadcast is evaluated block-by-block to bound peak
    memory usage.

    Args:
        masks1: (N, H, W) bool tensor.
        masks2: (M, H, W) bool tensor with the same spatial size as masks1.
        block_size: number of rows/columns of the output processed per chunk.

    Returns:
        (N, M) long tensor; entry (i, j) is the overlap pixel count of
        masks1[i] and masks2[j].
    """
    assert masks1.shape[1:] == masks2.shape[1:]
    assert masks1.dtype == torch.bool and masks2.dtype == torch.bool
    num_a, num_b = masks1.shape[0], masks2.shape[0]
    result = torch.zeros(num_a, num_b, device=masks1.device, dtype=torch.long)
    for row in range(0, num_a, block_size):
        chunk_a = masks1[row : row + block_size]
        for col in range(0, num_b, block_size):
            chunk_b = masks2[col : col + block_size]
            # Broadcast to (block, block, H, W), AND, then count overlap pixels.
            counts = (chunk_a[:, None] & chunk_b[None, :]).flatten(-2).sum(-1)
            result[row : row + block_size, col : col + block_size] = counts
    return result
28
+
29
+
30
def mask_iom(masks1: torch.Tensor, masks2: torch.Tensor) -> torch.Tensor:
    """Pairwise intersection-over-minimum (IoM) between two sets of bool masks.

    IoM = |A ∩ B| / min(|A|, |B|). It approaches 1 when one mask is (nearly)
    contained in the other, which makes it a good criterion for duplicate
    suppression.

    Args:
        masks1: (N, H, W) bool tensor.
        masks2: (M, H, W) bool tensor with the same spatial size as masks1.

    Returns:
        (N, M) float tensor of IoM values.
    """
    assert masks1.shape[1:] == masks2.shape[1:]
    assert masks1.dtype == torch.bool and masks2.dtype == torch.bool
    overlap = mask_intersection(masks1, masks2)
    areas_a = masks1.flatten(-2).sum(-1)  # (N,)
    areas_b = masks2.flatten(-2).sum(-1)  # (M,)
    # clamp_min(1) guards against empty masks; the epsilon is extra insurance.
    smaller = torch.min(areas_a[:, None], areas_b[None, :]).clamp_min(1)
    return overlap.float() / (smaller.float() + 1e-8)
38
+
39
+
40
def _decode_single_mask(mask_repr, h: int, w: int) -> np.ndarray:
    """Decode one mask representation into a binary (H, W) uint8 array.

    Accepts either a dense 2D array-like (list/tuple/ndarray) or a COCO
    compressed-RLE ``counts`` string (str/bytes); the latter requires
    pycocotools.

    Args:
        mask_repr: dense 2D array-like, or an RLE ``counts`` payload.
        h: mask height (used only for RLE decoding).
        w: mask width (used only for RLE decoding).

    Returns:
        (H, W) uint8 array with values in {0, 1}.

    Raises:
        ValueError: if a dense input is not 2D, or the type is unsupported.
        ImportError: if an RLE string is given but pycocotools is unavailable.
    """
    if isinstance(mask_repr, (list, tuple, np.ndarray)):
        arr = np.array(mask_repr)
        if arr.ndim != 2:
            raise ValueError("Mask array must be 2D (H, W).")
        return (arr > 0).astype(np.uint8)

    if mask_utils is None:
        raise ImportError(
            "pycocotools is required to decode RLE mask strings. pip install pycocotools"
        )

    if not isinstance(mask_repr, (str, bytes)):
        raise ValueError("Unsupported mask representation type for RLE decode.")

    # mask_repr is guaranteed str/bytes here, so it can be used directly as
    # the RLE "counts" payload (the original `else str(mask_repr)` branch was
    # unreachable after the isinstance guard above and has been removed).
    rle = {"counts": mask_repr, "size": [h, w]}
    decoded = mask_utils.decode(rle)
    if decoded.ndim == 3:
        decoded = decoded[:, :, 0]
    return (decoded > 0).astype(np.uint8)
63
+
64
+
65
def _decode_masks_to_torch_bool(pred_masks: List, h: int, w: int) -> torch.Tensor:
    """Decode a list of mask representations into one (N, H, W) bool tensor."""
    decoded = [_decode_single_mask(mask, h, w) for mask in pred_masks]
    stacked = np.stack(decoded, axis=0).astype(np.uint8)  # (N, H, W)
    return torch.from_numpy(stacked > 0)
69
+
70
+
71
def remove_overlapping_masks(sample: Dict, iom_thresh: float = 0.3) -> Dict:
    """
    Greedy keep: sort by score desc; keep a mask if IoM to all kept masks <= threshold.
    If pred_masks has length 0 or 1, returns sample unchanged (no extra keys).

    Args:
        sample: prediction dict. Must contain "pred_masks" (list); when there
            are >= 2 masks it must also contain "orig_img_h"/"orig_img_w".
            Optional: "pred_scores" (defaults to all 1.0) and "pred_boxes",
            each parallel to "pred_masks".
        iom_thresh: maximum allowed intersection-over-minimum with any
            already-kept (higher-scored) mask.

    Returns:
        A new dict with filtered "pred_masks"/"pred_scores" (and "pred_boxes"
        if present), preserving the original list order, plus bookkeeping keys
        "kept_indices", "removed_indices", "iom_threshold". For N <= 1 the
        input dict is returned untouched.
    """
    # Basic presence checks
    if "pred_masks" not in sample or not isinstance(sample["pred_masks"], list):
        return sample  # nothing to do / preserve as-is

    pred_masks = sample["pred_masks"]
    N = len(pred_masks)

    # --- Early exit: 0 or 1 mask -> do NOT modify the JSON at all ---
    if N <= 1:
        return sample

    # From here on we have at least 2 masks
    h = int(sample["orig_img_h"])
    w = int(sample["orig_img_w"])
    pred_scores = sample.get("pred_scores", [1.0] * N)  # fallback if scores missing
    pred_boxes = sample.get("pred_boxes", None)

    assert N == len(pred_scores), "pred_masks and pred_scores must have same length"
    if pred_boxes is not None:
        assert N == len(pred_boxes), "pred_masks and pred_boxes must have same length"

    masks_bool = _decode_masks_to_torch_bool(pred_masks, h, w)  # (N, H, W)

    # Visit masks from highest to lowest score so higher-scored masks win ties.
    order = sorted(range(N), key=lambda i: float(pred_scores[i]), reverse=True)
    kept_idx: List[int] = []
    kept_masks: List[torch.Tensor] = []

    for i in order:
        cand = masks_bool[i].unsqueeze(0)  # (1, H, W)
        if len(kept_masks) == 0:
            # First (highest-scored) mask is always kept.
            kept_idx.append(i)
            kept_masks.append(masks_bool[i])
            continue

        kept_stack = torch.stack(kept_masks, dim=0)  # (K, H, W)
        iom_vals = mask_iom(cand, kept_stack).squeeze(0)  # (K,)
        if torch.any(iom_vals > iom_thresh):
            continue  # overlaps too much with a higher-scored kept mask
        kept_idx.append(i)
        kept_masks.append(masks_bool[i])

    # Restore original list order for the surviving detections.
    kept_idx_sorted = sorted(kept_idx)

    # Build filtered JSON (this *does* modify fields; only for N>=2 case)
    out = dict(sample)
    out["pred_masks"] = [pred_masks[i] for i in kept_idx_sorted]
    out["pred_scores"] = [pred_scores[i] for i in kept_idx_sorted]
    if pred_boxes is not None:
        out["pred_boxes"] = [pred_boxes[i] for i in kept_idx_sorted]
    out["kept_indices"] = kept_idx_sorted
    out["removed_indices"] = [i for i in range(N) if i not in set(kept_idx_sorted)]
    out["iom_threshold"] = float(iom_thresh)
    return out
sam3/agent/helpers/masks.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import copy
4
+ import itertools
5
+ from typing import Any, Iterator, List, Union
6
+
7
+ import numpy as np
8
+ import pycocotools.mask as mask_util
9
+ import torch
10
+ from torch import device
11
+
12
+ from .boxes import Boxes
13
+ from .memory import retry_if_cuda_oom
14
+
15
+ from .roi_align import ROIAlign
16
+
17
+
18
def polygon_area(x, y):
    """Area of a simple polygon given its vertex coordinates, via the
    shoelace formula.

    https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates

    Args:
        x: 1D array of vertex x-coordinates, in traversal order.
        y: 1D array of vertex y-coordinates, in traversal order.

    Returns:
        Non-negative polygon area.
    """
    cross_sum = np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))
    return 0.5 * np.abs(cross_sum)
22
+
23
+
24
def polygons_to_bitmask(
    polygons: List[np.ndarray], height: int, width: int
) -> np.ndarray:
    """Rasterize a set of polygons belonging to one instance into a bitmask.

    Args:
        polygons (list[ndarray]): each array has shape (Nx2,) -- flattened
            [x0, y0, x1, y1, ...] coordinates.
        height, width (int): output mask size.

    Returns:
        ndarray: a bool mask of shape (height, width).
    """
    if not polygons:
        # COCOAPI does not support empty polygons
        return np.zeros((height, width)).astype(bool)
    encoded = mask_util.frPyObjects(polygons, height, width)
    merged = mask_util.merge(encoded)
    return mask_util.decode(merged).astype(bool)
41
+
42
+
43
def rasterize_polygons_within_box(
    polygons: List[np.ndarray], box: np.ndarray, mask_size: int
) -> torch.Tensor:
    """
    Rasterize the polygons into a mask image and crop the mask content within
    the given box; the cropped mask is resized to (mask_size, mask_size).

    This is used when generating training targets for the mask head in
    Mask R-CNN: given original ground-truth masks for an image, a
    `mask_size x mask_size` target must be produced for each predicted box.

    Args:
        polygons (list[ndarray[float]]): the polygons that make up one instance.
        box: 4-element numpy array (x0, y0, x1, y1).
        mask_size (int): side length of the rasterized output.

    Returns:
        Tensor: BoolTensor of shape (mask_size, mask_size)
    """
    # Work on copies: the shift/scale steps below mutate the coordinate arrays.
    polygons = copy.deepcopy(polygons)

    # 1. Shift the polygons so the box's top-left corner becomes the origin.
    box_w, box_h = box[2] - box[0], box[3] - box[1]
    for poly in polygons:
        poly[0::2] = poly[0::2] - box[0]
        poly[1::2] = poly[1::2] - box[1]

    # 2. Rescale the polygons to the target mask size.
    # max() avoids division by a near-zero box dimension.
    ratio_h = mask_size / max(box_h, 0.1)
    ratio_w = mask_size / max(box_w, 0.1)
    if ratio_h == ratio_w:
        for poly in polygons:
            poly *= ratio_h
    else:
        for poly in polygons:
            poly[0::2] *= ratio_w
            poly[1::2] *= ratio_h

    # 3. Rasterize with the COCO API and hand back a bool tensor.
    return torch.from_numpy(polygons_to_bitmask(polygons, mask_size, mask_size))
90
+
91
+
92
class BitMasks:
    """
    This class stores the segmentation masks for all objects in one image, in
    the form of bitmaps.

    Attributes:
        tensor: bool Tensor of N,H,W, representing N instances in the image.
    """

    def __init__(self, tensor: Union[torch.Tensor, np.ndarray]):
        """
        Args:
            tensor: bool Tensor of N,H,W, representing N instances in the image.
        """
        if isinstance(tensor, torch.Tensor):
            tensor = tensor.to(torch.bool)
        else:
            # Non-tensor inputs (e.g. ndarray) are materialized on CPU.
            tensor = torch.as_tensor(
                tensor, dtype=torch.bool, device=torch.device("cpu")
            )
        assert tensor.dim() == 3, tensor.size()
        # image_size is (H, W); tensor is the (N, H, W) bool mask stack.
        self.image_size = tensor.shape[1:]
        self.tensor = tensor

    @torch.jit.unused
    def to(self, *args: Any, **kwargs: Any) -> "BitMasks":
        # Delegates to torch.Tensor.to; returns a new BitMasks wrapper.
        return BitMasks(self.tensor.to(*args, **kwargs))

    @property
    def device(self) -> torch.device:
        return self.tensor.device

    @torch.jit.unused
    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks":
        """
        Returns:
            BitMasks: Create a new :class:`BitMasks` by indexing.

        The following usage are allowed:

        1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask.
        2. `new_masks = masks[2:10]`: return a slice of masks.
        3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor
           with `length = len(masks)`. Nonzero elements in the vector will be selected.

        Note that the returned object might share storage with this object,
        subject to Pytorch's indexing semantics.
        """
        if isinstance(item, int):
            # Re-add the instance dimension so the result stays (1, H, W).
            return BitMasks(self.tensor[item].unsqueeze(0))
        m = self.tensor[item]
        assert (
            m.dim() == 3
        ), "Indexing on BitMasks with {} returns a tensor with shape {}!".format(
            item, m.shape
        )
        return BitMasks(m)

    @torch.jit.unused
    def __iter__(self) -> torch.Tensor:
        # Yields one (H, W) bool tensor per instance.
        yield from self.tensor

    @torch.jit.unused
    def __repr__(self) -> str:
        s = self.__class__.__name__ + "("
        s += "num_instances={})".format(len(self.tensor))
        return s

    def __len__(self) -> int:
        return self.tensor.shape[0]

    def nonempty(self) -> torch.Tensor:
        """
        Find masks that are non-empty.

        Returns:
            Tensor: a BoolTensor which represents
                whether each mask is empty (False) or non-empty (True).
        """
        # A mask is non-empty iff any of its pixels is set.
        return self.tensor.flatten(1).any(dim=1)

    @staticmethod
    def from_polygon_masks(
        polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]],
        height: int,
        width: int,
    ) -> "BitMasks":
        """
        Rasterize polygon masks into a BitMasks of the given image size.

        Args:
            polygon_masks (list[list[ndarray]] or PolygonMasks)
            height, width (int)
        """
        if isinstance(polygon_masks, PolygonMasks):
            polygon_masks = polygon_masks.polygons
        masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks]
        if len(masks):
            return BitMasks(torch.stack([torch.from_numpy(x) for x in masks]))
        else:
            # No instances: return an empty (0, H, W) BitMasks.
            return BitMasks(torch.empty(0, height, width, dtype=torch.bool))

    @staticmethod
    def from_roi_masks(roi_masks: "ROIMasks", height: int, width: int) -> "BitMasks":
        """
        Paste ROI-local masks into full-image BitMasks.

        Args:
            roi_masks: ROIMasks to paste.
            height, width (int): output image size.
        """
        # NOTE(review): ROIMasks.to_bitmasks is declared as
        # (boxes, height, width, threshold=0.5) but is called here with only
        # (height, width) -- confirm against callers before relying on this.
        return roi_masks.to_bitmasks(height, width)

    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
        """
        Crop each bitmask by the given box, and resize results to (mask_size, mask_size).
        This can be used to prepare training targets for Mask R-CNN.
        It has less reconstruction error compared to rasterization with polygons.
        However we observe no difference in accuracy,
        but BitMasks requires more memory to store all the masks.

        Args:
            boxes (Tensor): Nx4 tensor storing the boxes for each mask
            mask_size (int): the size of the rasterized mask.

        Returns:
            Tensor:
                A bool tensor of shape (N, mask_size, mask_size), where
                N is the number of predicted boxes for this image.
        """
        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
        device = self.tensor.device

        # Prepend the batch index to each box: ROIAlign expects Nx5 rois,
        # pairing mask i with box i.
        batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[
            :, None
        ]
        rois = torch.cat([batch_inds, boxes], dim=1)  # Nx5

        # ROIAlign interpolates float values, so cast the bool masks first.
        bit_masks = self.tensor.to(dtype=torch.float32)
        rois = rois.to(device=device)
        output = (
            ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True)
            .forward(bit_masks[:, None, :, :], rois)
            .squeeze(1)
        )
        # Re-binarize the interpolated result.
        output = output >= 0.5
        return output

    def get_bounding_boxes(self) -> Boxes:
        """
        Returns:
            Boxes: tight bounding boxes around bitmasks.
            If a mask is empty, it's bounding box will be all zero.
        """
        boxes = torch.zeros(self.tensor.shape[0], 4, dtype=torch.float32)
        # Reducing over H (dim=1) flags columns with foreground -> x extent;
        # reducing over W (dim=2) flags rows with foreground -> y extent.
        x_any = torch.any(self.tensor, dim=1)
        y_any = torch.any(self.tensor, dim=2)
        for idx in range(self.tensor.shape[0]):
            x = torch.where(x_any[idx, :])[0]
            y = torch.where(y_any[idx, :])[0]
            if len(x) > 0 and len(y) > 0:
                # +1 makes the box exclusive of the last foreground pixel.
                boxes[idx, :] = torch.as_tensor(
                    [x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=torch.float32
                )
        return Boxes(boxes)

    @staticmethod
    def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks":
        """
        Concatenates a list of BitMasks into a single BitMasks

        Arguments:
            bitmasks_list (list[BitMasks])

        Returns:
            BitMasks: the concatenated BitMasks
        """
        assert isinstance(bitmasks_list, (list, tuple))
        assert len(bitmasks_list) > 0
        assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list)

        # type(...) keeps subclasses of BitMasks intact.
        cat_bitmasks = type(bitmasks_list[0])(
            torch.cat([bm.tensor for bm in bitmasks_list], dim=0)
        )
        return cat_bitmasks
273
+
274
+
275
class PolygonMasks:
    """
    This class stores the segmentation masks for all objects in one image, in the form of polygons.

    Attributes:
        polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon.
    """

    def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]):
        """
        Arguments:
            polygons (list[list[np.ndarray]]): The first
                level of the list correspond to individual instances,
                the second level to all the polygons that compose the
                instance, and the third level to the polygon coordinates.
                The third level array should have the format of
                [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
        """
        if not isinstance(polygons, list):
            raise ValueError(
                "Cannot create PolygonMasks: Expect a list of list of polygons per image. "
                "Got '{}' instead.".format(type(polygons))
            )

        def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
            # Use float64 for higher precision, because why not?
            # Always put polygons on CPU (self.to is a no-op) since they
            # are supposed to be small tensors.
            # May need to change this assumption if GPU placement becomes useful
            if isinstance(t, torch.Tensor):
                t = t.cpu().numpy()
            return np.asarray(t).astype("float64")

        def process_polygons(
            polygons_per_instance: List[Union[torch.Tensor, np.ndarray]],
        ) -> List[np.ndarray]:
            # Validate and normalize all polygons of one instance.
            if not isinstance(polygons_per_instance, list):
                raise ValueError(
                    "Cannot create polygons: Expect a list of polygons per instance. "
                    "Got '{}' instead.".format(type(polygons_per_instance))
                )
            # transform each polygon to a numpy array
            polygons_per_instance = [_make_array(p) for p in polygons_per_instance]
            for polygon in polygons_per_instance:
                # A valid polygon needs at least 3 (x, y) vertex pairs.
                if len(polygon) % 2 != 0 or len(polygon) < 6:
                    raise ValueError(
                        f"Cannot create a polygon from {len(polygon)} coordinates."
                    )
            return polygons_per_instance

        self.polygons: List[List[np.ndarray]] = [
            process_polygons(polygons_per_instance)
            for polygons_per_instance in polygons
        ]

    def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks":
        # Polygons always live on CPU, so device moves are a no-op.
        return self

    @property
    def device(self) -> torch.device:
        return torch.device("cpu")

    def get_bounding_boxes(self) -> Boxes:
        """
        Returns:
            Boxes: tight bounding boxes around polygon masks.
        """
        boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32)
        for idx, polygons_per_instance in enumerate(self.polygons):
            # Accumulate the min/max corner over all polygons of the instance.
            minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32)
            maxxy = torch.zeros(2, dtype=torch.float32)
            for polygon in polygons_per_instance:
                coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32)
                minxy = torch.min(minxy, torch.min(coords, dim=0).values)
                maxxy = torch.max(maxxy, torch.max(coords, dim=0).values)
            boxes[idx, :2] = minxy
            boxes[idx, 2:] = maxxy
        return Boxes(boxes)

    def nonempty(self) -> torch.Tensor:
        """
        Find masks that are non-empty.

        Returns:
            Tensor:
                a BoolTensor which represents whether each mask is empty (False) or not (True).
        """
        # An instance is non-empty iff it has at least one polygon.
        keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons]
        return torch.from_numpy(np.asarray(keep, dtype=bool))

    def __getitem__(
        self, item: Union[int, slice, List[int], torch.BoolTensor]
    ) -> "PolygonMasks":
        """
        Support indexing over the instances and return a `PolygonMasks` object.
        `item` can be:

        1. An integer. It will return an object with only one instance.
        2. A slice. It will return an object with the selected instances.
        3. A list[int]. It will return an object with the selected instances,
           correpsonding to the indices in the list.
        4. A vector mask of type BoolTensor, whose length is num_instances.
           It will return an object with the instances whose mask is nonzero.
        """
        # NOTE(review): an unsupported item type (e.g. a tuple) falls through
        # every branch and leaves `selected_polygons` unbound -> NameError.
        if isinstance(item, int):
            selected_polygons = [self.polygons[item]]
        elif isinstance(item, slice):
            selected_polygons = self.polygons[item]
        elif isinstance(item, list):
            selected_polygons = [self.polygons[i] for i in item]
        elif isinstance(item, torch.Tensor):
            # Polygons is a list, so we have to move the indices back to CPU.
            if item.dtype == torch.bool:
                assert item.dim() == 1, item.shape
                item = item.nonzero().squeeze(1).cpu().numpy().tolist()
            elif item.dtype in [torch.int32, torch.int64]:
                item = item.cpu().numpy().tolist()
            else:
                raise ValueError(
                    "Unsupported tensor dtype={} for indexing!".format(item.dtype)
                )
            selected_polygons = [self.polygons[i] for i in item]
        return PolygonMasks(selected_polygons)

    def __iter__(self) -> Iterator[List[np.ndarray]]:
        """
        Yields:
            list[ndarray]: the polygons for one instance.
            Each Tensor is a float64 vector representing a polygon.
        """
        return iter(self.polygons)

    def __repr__(self) -> str:
        s = self.__class__.__name__ + "("
        s += "num_instances={})".format(len(self.polygons))
        return s

    def __len__(self) -> int:
        return len(self.polygons)

    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
        """
        Crop each mask by the given box, and resize results to (mask_size, mask_size).
        This can be used to prepare training targets for Mask R-CNN.

        Args:
            boxes (Tensor): Nx4 tensor storing the boxes for each mask
            mask_size (int): the size of the rasterized mask.

        Returns:
            Tensor: A bool tensor of shape (N, mask_size, mask_size), where
            N is the number of predicted boxes for this image.
        """
        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))

        device = boxes.device
        # Put boxes on the CPU, as the polygon representation is not efficient GPU-wise
        # (several small tensors for representing a single instance mask)
        boxes = boxes.to(torch.device("cpu"))

        results = [
            rasterize_polygons_within_box(poly, box.numpy(), mask_size)
            for poly, box in zip(self.polygons, boxes)
        ]
        """
        poly: list[list[float]], the polygons for one instance
        box: a tensor of shape (4,)
        """
        if len(results) == 0:
            return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device)
        # Return on the caller's original device.
        return torch.stack(results, dim=0).to(device=device)

    def area(self):
        """
        Computes area of the mask.
        Only works with Polygons, using the shoelace formula:
        https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates

        Returns:
            Tensor: a vector, area for each instance
        """

        area = []
        for polygons_per_instance in self.polygons:
            # Sum over all the parts that make up one instance.
            area_per_instance = 0
            for p in polygons_per_instance:
                area_per_instance += polygon_area(p[0::2], p[1::2])
            area.append(area_per_instance)

        return torch.tensor(area)

    @staticmethod
    def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks":
        """
        Concatenates a list of PolygonMasks into a single PolygonMasks

        Arguments:
            polymasks_list (list[PolygonMasks])

        Returns:
            PolygonMasks: the concatenated PolygonMasks
        """
        assert isinstance(polymasks_list, (list, tuple))
        assert len(polymasks_list) > 0
        assert all(isinstance(polymask, PolygonMasks) for polymask in polymasks_list)

        # type(...) keeps subclasses of PolygonMasks intact.
        cat_polymasks = type(polymasks_list[0])(
            list(itertools.chain.from_iterable(pm.polygons for pm in polymasks_list))
        )
        return cat_polymasks
485
+
486
+
487
class ROIMasks:
    """
    Represent masks by N smaller masks defined in some ROIs. Once ROI boxes are given,
    full-image bitmask can be obtained by "pasting" the mask on the region defined
    by the corresponding ROI box.
    """

    def __init__(self, tensor: torch.Tensor):
        """
        Args:
            tensor: (N, M, M) mask tensor that defines the mask within each ROI.
        """
        if tensor.dim() != 3:
            raise ValueError("ROIMasks must take a masks of 3 dimension.")
        self.tensor = tensor

    def to(self, device: torch.device) -> "ROIMasks":
        # Returns a new wrapper around the moved tensor.
        return ROIMasks(self.tensor.to(device))

    @property
    def device(self) -> device:
        return self.tensor.device

    def __len__(self):
        return self.tensor.shape[0]

    def __getitem__(self, item) -> "ROIMasks":
        """
        Returns:
            ROIMasks: Create a new :class:`ROIMasks` by indexing.

        The following usage are allowed:

        1. `new_masks = masks[2:10]`: return a slice of masks.
        2. `new_masks = masks[vector]`, where vector is a torch.BoolTensor
           with `length = len(masks)`. Nonzero elements in the vector will be selected.

        Note that the returned object might share storage with this object,
        subject to Pytorch's indexing semantics.
        """
        t = self.tensor[item]
        # Integer indexing would drop the instance dim, hence the check.
        if t.dim() != 3:
            raise ValueError(
                f"Indexing on ROIMasks with {item} returns a tensor with shape {t.shape}!"
            )
        return ROIMasks(t)

    @torch.jit.unused
    def __repr__(self) -> str:
        s = self.__class__.__name__ + "("
        s += "num_instances={})".format(len(self.tensor))
        return s

    @torch.jit.unused
    def to_bitmasks(self, boxes: torch.Tensor, height, width, threshold=0.5):
        """
        Paste the ROI-local masks into a full-image BitMasks.

        Args: see documentation of :func:`paste_masks_in_image`.
        """
        # Imported lazily so detectron2 is only required when pasting.
        from detectron2.layers.mask_ops import (
            _paste_masks_tensor_shape,
            paste_masks_in_image,
        )

        if torch.jit.is_tracing():
            # Under tracing, pick the shape-tensor-aware variant when the
            # output size is itself a tensor.
            if isinstance(height, torch.Tensor):
                paste_func = _paste_masks_tensor_shape
            else:
                paste_func = paste_masks_in_image
        else:
            # In eager mode, wrap with the CUDA-OOM retry helper.
            paste_func = retry_if_cuda_oom(paste_masks_in_image)
        bitmasks = paste_func(
            self.tensor, boxes.tensor, (height, width), threshold=threshold
        )
        return BitMasks(bitmasks)
+ return BitMasks(bitmasks)
sam3/agent/helpers/memory.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import logging
4
+ from contextlib import contextmanager
5
+ from functools import wraps
6
+
7
+ import torch
8
+
9
+ __all__ = ["retry_if_cuda_oom"]
10
+
11
+
12
@contextmanager
def _ignore_torch_cuda_oom():
    """Context manager that swallows PyTorch CUDA out-of-memory errors.

    Every other RuntimeError (and any non-RuntimeError) propagates unchanged.
    """
    try:
        yield
    except RuntimeError as err:
        # PyTorch raises OOM as a plain RuntimeError, so match on the message.
        # NOTE: the string may change across PyTorch versions.
        if "CUDA out of memory. " not in str(err):
            raise
25
+
26
+
27
def retry_if_cuda_oom(func):
    """
    Makes a function retry itself after encountering
    pytorch's CUDA OOM error.
    It will first retry after calling `torch.cuda.empty_cache()`.

    If that still fails, it will then retry by trying to convert inputs to CPUs.
    In this case, it expects the function to dispatch to CPU implementation.
    The return values may become CPU tensors as well and it's user's
    responsibility to convert it back to CUDA tensor if needed.

    Args:
        func: a stateless callable that takes tensor-like objects as arguments

    Returns:
        a callable which retries `func` if OOM is encountered.

    Examples:
    ::
        output = retry_if_cuda_oom(some_torch_function)(input1, input2)
        # output may be on CPU even if inputs are on GPU

    Note:
        1. When converting inputs to CPU, it will only look at each argument and check
           if it has `.device` and `.to` for conversion. Nested structures of tensors
           are not supported.

        2. Since the function might be called more than once, it has to be
           stateless.
    """

    def _cpu_fallback(value):
        # Move plain tensor-like objects (with .device and .to) off the GPU;
        # anything else is passed through untouched.
        try:
            looks_like_cuda_tensor = value.device.type == "cuda" and hasattr(
                value, "to"
            )
        except AttributeError:
            looks_like_cuda_tensor = False
        return value.to(device="cpu") if looks_like_cuda_tensor else value

    @wraps(func)
    def wrapped(*args, **kwargs):
        # Attempt 1: run as-is; an OOM is swallowed and we fall through.
        with _ignore_torch_cuda_oom():
            return func(*args, **kwargs)

        # Attempt 2: free cached blocks and retry on the same devices.
        torch.cuda.empty_cache()
        with _ignore_torch_cuda_oom():
            return func(*args, **kwargs)

        # Attempt 3: copy tensor inputs to CPU. This is slow, so log a notice.
        logger = logging.getLogger(__name__)
        logger.info(
            "Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func))
        )
        cpu_args = (_cpu_fallback(a) for a in args)
        cpu_kwargs = {key: _cpu_fallback(val) for key, val in kwargs.items()}
        return func(*cpu_args, **cpu_kwargs)

    return wrapped
sam3/agent/helpers/rle.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """Some utilities for RLE encoding that doesn't require downloading the masks to the cpu"""
4
+
5
+ import numpy as np
6
+ import torch
7
+ from pycocotools import mask as mask_util
8
+
9
+
10
@torch.no_grad()
def rle_encode(orig_mask, return_areas=False):
    """Encodes a collection of masks in RLE format

    This function emulates the behavior of the COCO API's encode function, but
    is executed partially on the GPU for faster execution.

    Args:
        orig_mask (torch.Tensor): A mask of shape (N, H, W) with dtype=torch.bool
        return_areas (bool): If True, add the areas of the masks as a part of
            the RLE output dict under the "area" key. Default is False.

    Returns:
        list[dict]: one COCO-style RLE dict per mask, with utf-8 "counts".
    """
    assert orig_mask.ndim == 3, "Mask must be of shape (N, H, W)"
    assert orig_mask.dtype == torch.bool, "Mask must have dtype=torch.bool"

    if orig_mask.numel() == 0:
        return []

    # First, transpose the spatial dimensions.
    # This is necessary because the COCO API uses Fortran order
    mask = orig_mask.transpose(1, 2)

    # Flatten the mask
    flat_mask = mask.reshape(mask.shape[0], -1)
    if return_areas:
        mask_areas = flat_mask.sum(-1).tolist()
    # Find the indices where the mask changes
    # differences[i, k] is True where a new run starts; the sentinel last
    # column (left as the initial 1) closes the final run of each mask.
    differences = torch.ones(
        mask.shape[0], flat_mask.shape[1] + 1, device=mask.device, dtype=torch.bool
    )
    differences[:, 1:-1] = flat_mask[:, :-1] != flat_mask[:, 1:]
    differences[:, 0] = flat_mask[:, 0]
    _, change_indices = torch.where(differences)

    # Per-mask cumulative counts of change points, used to slice the flat
    # change_indices vector back into per-mask segments.
    try:
        boundaries = torch.cumsum(differences.sum(-1), 0).cpu()
    except RuntimeError as _:
        # Fall back to summing on CPU (e.g. on OOM).
        boundaries = torch.cumsum(differences.cpu().sum(-1), 0)

    change_indices_clone = change_indices.clone()
    # First pass computes the RLEs on GPU, in a flatten format
    # (differencing consecutive change positions yields run lengths).
    for i in range(mask.shape[0]):
        # Get the change indices for this batch item
        beg = 0 if i == 0 else boundaries[i - 1].item()
        end = boundaries[i].item()
        change_indices[beg + 1 : end] -= change_indices_clone[beg : end - 1]

    # Now we can split the RLES of each batch item, and convert them to strings
    # No more gpu at this point
    change_indices = change_indices.tolist()

    batch_rles = []
    # Process each mask in the batch separately
    for i in range(mask.shape[0]):
        beg = 0 if i == 0 else boundaries[i - 1].item()
        end = boundaries[i].item()
        run_lengths = change_indices[beg:end]

        # Compress the uncompressed RLE with pycocotools, then decode the
        # bytes "counts" to a utf-8 string for JSON friendliness.
        uncompressed_rle = {"counts": run_lengths, "size": list(orig_mask.shape[1:])}
        h, w = uncompressed_rle["size"]
        rle = mask_util.frPyObjects(uncompressed_rle, h, w)
        rle["counts"] = rle["counts"].decode("utf-8")
        if return_areas:
            rle["area"] = mask_areas[i]
        batch_rles.append(rle)

    return batch_rles
80
+
81
+
82
def robust_rle_encode(masks):
    """Encode a batch of masks in RLE format, preferring the GPU path.

    Falls back to the pycocotools CPU encoder when the GPU path raises a
    RuntimeError (e.g. CUDA OOM).

    Args:
        masks (torch.Tensor): (N, H, W) tensor with dtype=torch.bool.

    Returns:
        list[dict]: COCO-style RLE dicts with utf-8 "counts" strings.
    """
    assert masks.ndim == 3, "Mask must be of shape (N, H, W)"
    assert masks.dtype == torch.bool, "Mask must have dtype=torch.bool"

    try:
        return rle_encode(masks)
    except RuntimeError:
        pass

    # CPU fallback: encode one Fortran-ordered uint8 mask at a time.
    np_masks = masks.cpu().numpy()
    rles = []
    for single in np_masks:
        encoded = mask_util.encode(
            np.array(single[:, :, np.newaxis], dtype=np.uint8, order="F")
        )[0]
        encoded["counts"] = encoded["counts"].decode("utf-8")
        rles.append(encoded)
    return rles
+
102
+
103
def ann_to_rle(segm, im_info):
    """Convert a segmentation (polygons or uncompressed RLE) to compressed RLE.

    Args:
        segm: a polygon list, an uncompressed-RLE dict (list "counts"), or an
            already-compressed RLE dict.
        im_info (dict): must provide "height" and "width" of the image.

    Returns:
        dict: compressed RLE.
    """
    h, w = im_info["height"], im_info["width"]
    if isinstance(segm, list):
        # polygon -- a single object might consist of multiple parts;
        # merge all parts into one mask RLE.
        return mask_util.merge(mask_util.frPyObjects(segm, h, w))
    if isinstance(segm["counts"], list):
        # uncompressed RLE
        return mask_util.frPyObjects(segm, h, w)
    # already compressed RLE: pass through unchanged
    return segm
sam3/agent/helpers/roi_align.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from torch import nn
4
+ from torchvision.ops import roi_align
5
+
6
+
7
# NOTE: torchvision's RoIAlign has a different default aligned=False
class ROIAlign(nn.Module):
    def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
        """
        Args:
            output_size (tuple): h, w
            spatial_scale (float): scale the input boxes by this number
            sampling_ratio (int): number of inputs samples to take for each output
                sample. 0 to take samples densely.
            aligned (bool): if False, use the legacy implementation in
                Detectron. If True, align the results more perfectly.

        Note:
            The meaning of aligned=True:

            Given a continuous coordinate c, its two neighboring pixel indices (in our
            pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
            c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
            from the underlying signal at continuous coordinates 0.5 and 1.5). But the original
            roi_align (aligned=False) does not subtract the 0.5 when computing neighboring
            pixel indices and therefore it uses pixels with a slightly incorrect alignment
            (relative to our pixel model) when performing bilinear interpolation.

            With `aligned=True`,
            we first appropriately scale the ROI and then shift it by -0.5
            prior to calling roi_align. This produces the correct neighbors; see
            detectron2/tests/test_roi_align.py for verification.

            The difference does not make a difference to the model's performance if
            ROIAlign is used together with conv layers.
        """
        super().__init__()
        self.output_size = output_size
        self.spatial_scale = spatial_scale
        self.sampling_ratio = sampling_ratio
        self.aligned = aligned

        # Imported lazily so the check runs at construction time.
        from torchvision import __version__

        version = tuple(int(x) for x in __version__.split(".")[:2])
        # https://github.com/pytorch/vision/pull/2438
        assert version >= (0, 7), "Require torchvision >= 0.7"

    def forward(self, input, rois):
        """
        Args:
            input: NCHW images
            rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
        """
        assert rois.dim() == 2 and rois.size(1) == 5
        # torchvision's roi_align operates on float tensors only.
        if input.is_quantized:
            input = input.dequantize()
        return roi_align(
            input,
            rois.to(dtype=input.dtype),
            self.output_size,
            self.spatial_scale,
            self.sampling_ratio,
            self.aligned,
        )

    def __repr__(self):
        tmpstr = self.__class__.__name__ + "("
        tmpstr += "output_size=" + str(self.output_size)
        tmpstr += ", spatial_scale=" + str(self.spatial_scale)
        tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
        tmpstr += ", aligned=" + str(self.aligned)
        tmpstr += ")"
        return tmpstr
sam3/agent/helpers/rotated_boxes.py ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ from __future__ import absolute_import, division, print_function, unicode_literals
4
+
5
+ import math
6
+ from typing import List, Tuple
7
+
8
+ import torch
9
+
10
+ # from detectron2.layers.rotated_boxes import pairwise_iou_rotated
11
+
12
+ from .boxes import Boxes
13
+
14
+
15
def pairwise_iou_rotated(boxes1, boxes2):
    """
    Return intersection-over-union (Jaccard index) of boxes.

    Both sets of boxes are expected to be in
    (x_center, y_center, width, height, angle) format.

    Arguments:
        boxes1 (Tensor[N, 5])
        boxes2 (Tensor[M, 5])

    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """
    # NOTE(review): dispatches to the custom op registered by detectron2's
    # C extensions; detectron2 must be installed/built for this to resolve.
    return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2)
31
+
32
+
33
class RotatedBoxes(Boxes):
    """
    This structure stores a list of rotated boxes as a Nx5 torch.Tensor.
    It supports some common methods about boxes
    (`area`, `clip`, `nonempty`, etc),
    and also behaves like a Tensor
    (support indexing, `to(device)`, `.device`, and iteration over all boxes)
    """

    def __init__(self, tensor: torch.Tensor):
        """
        Args:
            tensor (Tensor[float]): a Nx5 matrix. Each row is
                (x_center, y_center, width, height, angle),
                in which angle is represented in degrees.
                While there's no strict range restriction for it,
                the recommended principal range is between [-180, 180) degrees.

        Assume we have a horizontal box B = (x_center, y_center, width, height),
        where width is along the x-axis and height is along the y-axis.
        The rotated box B_rot (x_center, y_center, width, height, angle)
        can be seen as:

        1. When angle == 0:
           B_rot == B
        2. When angle > 0:
           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW;
        3. When angle < 0:
           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW.

        Mathematically, since the right-handed coordinate system for image space
        is (y, x), where y is top->down and x is left->right, the 4 vertices of the
        rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from
        the vertices of the horizontal rectangle :math:`(y_i, x_i)` (i = 1, 2, 3, 4)
        in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians,
        :math:`(y_c, x_c)` is the center of the rectangle):

        .. math::

            yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c,

            xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c,

        which is the standard rigid-body rotation transformation.

        Intuitively, the angle is
        (1) the rotation angle from y-axis in image space
        to the height vector (top->down in the box's local coordinate system)
        of the box in CCW, and
        (2) the rotation angle from x-axis in image space
        to the width vector (left->right in the box's local coordinate system)
        of the box in CCW.

        More intuitively, consider the following horizontal box ABCD represented
        in (x1, y1, x2, y2): (3, 2, 7, 4),
        covering the [3, 7] x [2, 4] region of the continuous coordinate system
        which looks like this:

        .. code:: none

            O--------> x
            |
            |  A---B
            |  |   |
            |  D---C
            |
            v y

        Note that each capital letter represents one 0-dimensional geometric point
        instead of a 'square pixel' here.

        In the example above, using (x, y) to represent a point we have:

        .. math::

            O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4)

        We name vector AB = vector DC as the width vector in box's local coordinate system, and
        vector AD = vector BC as the height vector in box's local coordinate system. Initially,
        when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis
        in the image space, respectively.

        For better illustration, we denote the center of the box as E,

        .. code:: none

            O--------> x
            |
            |  A---B
            |  | E |
            |  D---C
            |
            v y

        where the center E = ((3+7)/2, (2+4)/2) = (5, 3).

        Also,

        .. math::

            width = |AB| = |CD| = 7 - 3 = 4,
            height = |AD| = |BC| = 4 - 2 = 2.

        Therefore, the corresponding representation for the same shape in rotated box in
        (x_center, y_center, width, height, angle) format is:

        (5, 3, 4, 2, 0),

        Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees
        CCW (counter-clockwise) by definition. It looks like this:

        .. code:: none

            O--------> x
            |   B-C
            |   | |
            |   |E|
            |   | |
            |   A-D
            v y

        The center E is still located at the same point (5, 3), while the vertices
        ABCD are rotated by 90 degrees CCW with regard to E:
        A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5)

        Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to
        vector AD or vector BC (the top->down height vector in box's local coordinate system),
        or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right
        width vector in box's local coordinate system).

        .. math::

            width = |AB| = |CD| = 5 - 1 = 4,
            height = |AD| = |BC| = 6 - 4 = 2.

        Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise)
        by definition? It looks like this:

        .. code:: none

            O--------> x
            |   D-A
            |   | |
            |   |E|
            |   | |
            |   C-B
            v y

        The center E is still located at the same point (5, 3), while the vertices
        ABCD are rotated by 90 degrees CW with regard to E:
        A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1)

        .. math::

            width = |AB| = |CD| = 5 - 1 = 4,
            height = |AD| = |BC| = 6 - 4 = 2.

        This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU
        will be 1. However, these two will generate different RoI Pooling results and
        should not be treated as an identical box.

        On the other hand, it's easy to see that (X, Y, W, H, A) is identical to
        (X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be
        identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is
        equivalent to rotating the same shape 90 degrees CW.

        We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180):

        .. code:: none

            O--------> x
            |
            |  C---D
            |  | E |
            |  B---A
            |
            v y

        .. math::

            A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2),

            width = |AB| = |CD| = 7 - 3 = 4,
            height = |AD| = |BC| = 4 - 2 = 2.

        Finally, this is a very inaccurate (heavily quantized) illustration of
        how (5, 3, 4, 2, 60) looks like in case anyone wonders:

        .. code:: none

            O--------> x
            |     B\
            |    /  C
            |   /E /
            |  A  /
            |   `D
            v y

        It's still a rectangle with center of (5, 3), width of 4 and height of 2,
        but its angle (and thus orientation) is somewhere between
        (5, 3, 4, 2, 0) and (5, 3, 4, 2, 90).
        """
        device = (
            tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
        )
        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
        if tensor.numel() == 0:
            # Use reshape, so we don't end up creating a new tensor that does not depend on
            # the inputs (and consequently confuses jit)
            tensor = tensor.reshape((0, 5)).to(dtype=torch.float32, device=device)
        assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size()

        self.tensor = tensor

    def clone(self) -> "RotatedBoxes":
        """
        Clone the RotatedBoxes.

        Returns:
            RotatedBoxes
        """
        return RotatedBoxes(self.tensor.clone())

    def to(self, device: torch.device, non_blocking: bool = False):
        # Boxes are assumed float32 and does not support to(dtype)
        return RotatedBoxes(self.tensor.to(device=device, non_blocking=non_blocking))

    def area(self) -> torch.Tensor:
        """
        Computes the area of all the boxes.

        Returns:
            torch.Tensor: a vector with areas of each box.
        """
        box = self.tensor
        area = box[:, 2] * box[:, 3]
        return area

    # Avoid in-place operations so that we can torchscript; NOTE: this creates a new tensor
    def normalize_angles(self) -> None:
        """
        Restrict angles to the range of [-180, 180) degrees
        """
        angle_tensor = (self.tensor[:, 4] + 180.0) % 360.0 - 180.0
        self.tensor = torch.cat((self.tensor[:, :4], angle_tensor[:, None]), dim=1)

    def clip(
        self, box_size: Tuple[int, int], clip_angle_threshold: float = 1.0
    ) -> None:
        """
        Clip (in place) the boxes by limiting x coordinates to the range [0, width]
        and y coordinates to the range [0, height].

        For RRPN:
        Only clip boxes that are almost horizontal with a tolerance of
        clip_angle_threshold to maintain backward compatibility.

        Rotated boxes beyond this threshold are not clipped for two reasons:

        1. There are potentially multiple ways to clip a rotated box to make it
           fit within the image.
        2. It's tricky to make the entire rectangular box fit within the image
           and still be able to not leave out pixels of interest.

        Therefore we rely on ops like RoIAlignRotated to safely handle this.

        Args:
            box_size (height, width): The clipping box's size.
            clip_angle_threshold:
                Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees),
                we do the clipping as horizontal boxes.
        """
        h, w = box_size

        # normalize angles to be within [-180, 180) degrees
        self.normalize_angles()

        idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0]

        # convert to (x1, y1, x2, y2)
        x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0
        y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0
        x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0
        y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0

        # clip
        x1.clamp_(min=0, max=w)
        y1.clamp_(min=0, max=h)
        x2.clamp_(min=0, max=w)
        y2.clamp_(min=0, max=h)

        # convert back to (xc, yc, w, h)
        self.tensor[idx, 0] = (x1 + x2) / 2.0
        self.tensor[idx, 1] = (y1 + y2) / 2.0
        # make sure widths and heights do not increase due to numerical errors
        self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1)
        self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1)

    def nonempty(self, threshold: float = 0.0) -> torch.Tensor:
        """
        Find boxes that are non-empty.
        A box is considered empty, if either of its side is no larger than threshold.

        Returns:
            Tensor: a binary vector which represents
            whether each box is empty (False) or non-empty (True).
        """
        box = self.tensor
        widths = box[:, 2]
        heights = box[:, 3]
        keep = (widths > threshold) & (heights > threshold)
        return keep

    def __getitem__(self, item) -> "RotatedBoxes":
        """
        Returns:
            RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing.

        The following usage are allowed:

        1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box.
        2. `new_boxes = boxes[2:10]`: return a slice of boxes.
        3. `new_boxes = boxes[vector]`, where vector is a torch.ByteTensor
           with `length = len(boxes)`. Nonzero elements in the vector will be selected.

        Note that the returned RotatedBoxes might share storage with this RotatedBoxes,
        subject to Pytorch's indexing semantics.
        """
        if isinstance(item, int):
            return RotatedBoxes(self.tensor[item].view(1, -1))
        b = self.tensor[item]
        assert (
            b.dim() == 2
        ), "Indexing on RotatedBoxes with {} failed to return a matrix!".format(item)
        return RotatedBoxes(b)

    def __len__(self) -> int:
        return self.tensor.shape[0]

    def __repr__(self) -> str:
        return "RotatedBoxes(" + str(self.tensor) + ")"

    def inside_box(
        self, box_size: Tuple[int, int], boundary_threshold: int = 0
    ) -> torch.Tensor:
        """
        Args:
            box_size (height, width): Size of the reference box covering
                [0, width] x [0, height]
            boundary_threshold (int): Boxes that extend beyond the reference box
                boundary by more than boundary_threshold are considered "outside".

        For RRPN, it might not be necessary to call this function since it's common
        for rotated box to extend to outside of the image boundaries
        (the clip function only clips the near-horizontal boxes)

        Returns:
            a binary vector, indicating whether each box is inside the reference box.
        """
        height, width = box_size

        cnt_x = self.tensor[..., 0]
        cnt_y = self.tensor[..., 1]
        half_w = self.tensor[..., 2] / 2.0
        half_h = self.tensor[..., 3] / 2.0
        a = self.tensor[..., 4]
        c = torch.abs(torch.cos(a * math.pi / 180.0))
        s = torch.abs(torch.sin(a * math.pi / 180.0))
        # This basically computes the horizontal bounding rectangle of the rotated box
        max_rect_dx = c * half_w + s * half_h
        max_rect_dy = c * half_h + s * half_w

        inds_inside = (
            (cnt_x - max_rect_dx >= -boundary_threshold)
            & (cnt_y - max_rect_dy >= -boundary_threshold)
            & (cnt_x + max_rect_dx < width + boundary_threshold)
            & (cnt_y + max_rect_dy < height + boundary_threshold)
        )

        return inds_inside

    def get_centers(self) -> torch.Tensor:
        """
        Returns:
            The box centers in a Nx2 array of (x, y).
        """
        return self.tensor[:, :2]

    def scale(self, scale_x: float, scale_y: float) -> None:
        """
        Scale the rotated box with horizontal and vertical scaling factors
        Note: when scale_factor_x != scale_factor_y,
        the rotated box does not preserve the rectangular shape when the angle
        is not a multiple of 90 degrees under resize transformation.
        Instead, the shape is a parallelogram (that has skew)
        Here we make an approximation by fitting a rotated rectangle to the parallelogram.
        """
        self.tensor[:, 0] *= scale_x
        self.tensor[:, 1] *= scale_y
        theta = self.tensor[:, 4] * math.pi / 180.0
        c = torch.cos(theta)
        s = torch.sin(theta)

        # In image space, y is top->down and x is left->right
        # Consider the local coordintate system for the rotated box,
        # where the box center is located at (0, 0), and the four vertices ABCD are
        # A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2)
        # the midpoint of the left edge AD of the rotated box E is:
        # E = (A+D)/2 = (-w / 2, 0)
        # the midpoint of the top edge AB of the rotated box F is:
        # F(0, -h / 2)
        # To get the old coordinates in the global system, apply the rotation transformation
        # (Note: the right-handed coordinate system for image space is yOx):
        # (old_x, old_y) = (s * y + c * x, c * y - s * x)
        # E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2)
        # F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2)
        # After applying the scaling factor (sfx, sfy):
        # E(new) = (-sfx * c * w / 2, sfy * s * w / 2)
        # F(new) = (-sfx * s * h / 2, -sfy * c * h / 2)
        # The new width after scaling tranformation becomes:

        # w(new) = |E(new) - O| * 2
        #        = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2
        #        = sqrt[(sfx * c)^2 + (sfy * s)^2] * w
        # i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2]
        #
        # For example,
        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x;
        # when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y
        self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2)

        # h(new) = |F(new) - O| * 2
        #        = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2
        #        = sqrt[(sfx * s)^2 + (sfy * c)^2] * h
        # i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2]
        #
        # For example,
        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y;
        # when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x
        self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2)

        # The angle is the rotation angle from y-axis in image space to the height
        # vector (top->down in the box's local coordinate system) of the box in CCW.
        #
        # angle(new) = angle_yOx(O - F(new))
        #            = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) )
        #            = atan2(sfx * s * h / 2, sfy * c * h / 2)
        #            = atan2(sfx * s, sfy * c)
        #
        # For example,
        # when sfx == sfy, angle(new) == atan2(s, c) == angle(old)
        self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi

    @classmethod
    def cat(cls, boxes_list: List["RotatedBoxes"]) -> "RotatedBoxes":
        """
        Concatenates a list of RotatedBoxes into a single RotatedBoxes

        Arguments:
            boxes_list (list[RotatedBoxes])

        Returns:
            RotatedBoxes: the concatenated RotatedBoxes
        """
        assert isinstance(boxes_list, (list, tuple))
        if len(boxes_list) == 0:
            return cls(torch.empty(0))
        assert all([isinstance(box, RotatedBoxes) for box in boxes_list])

        # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
        cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0))
        return cat_boxes

    @property
    def device(self) -> torch.device:
        return self.tensor.device

    @torch.jit.unused
    def __iter__(self):
        """
        Yield a box as a Tensor of shape (5,) at a time.
        """
        yield from self.tensor
516
+
517
+
518
def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> torch.Tensor:
    """
    Given two lists of rotated boxes of size N and M,
    compute the IoU (intersection over union)
    between **all** N x M pairs of boxes.
    The box order must be (x_center, y_center, width, height, angle).

    Args:
        boxes1, boxes2 (RotatedBoxes):
            two `RotatedBoxes`. Contains N & M rotated boxes, respectively.

    Returns:
        Tensor: IoU, sized [N,M].
    """
    # The return annotation was previously "-> None", but the function
    # returns the NxM IoU matrix from pairwise_iou_rotated.
    return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor)
sam3/agent/helpers/som_utils.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import colorsys
4
+ from dataclasses import dataclass
5
+ from typing import List, Tuple
6
+
7
+ import cv2
8
+ import matplotlib as mpl
9
+ import matplotlib.colors as mplc
10
+ import numpy as np
11
+ import pycocotools.mask as mask_utils
12
+
13
+
14
def rgb_to_hex(rgb_color):
    """
    Convert an RGB color to a hex color string.

    Args:
        rgb_color (tuple/list of ints): RGB color, each channel in [0, 255].

    Returns:
        str: Hex color, lowercase, prefixed with "#".

    Example:
        ```
        >>> rgb_to_hex((255, 0, 244))
        '#ff00f4'
        ```
    """
    # "02x" zero-pads each channel to two lowercase hex digits.
    return "#" + "".join(format(c, "02x") for c in rgb_color)
31
+
32
+
33
+ # DEFAULT_COLOR_HEX_TO_NAME = {
34
+ # rgb_to_hex((255, 0, 0)): "red",
35
+ # rgb_to_hex((0, 255, 0)): "lime",
36
+ # rgb_to_hex((0, 0, 255)): "blue",
37
+ # rgb_to_hex((255, 255, 0)): "yellow",
38
+ # rgb_to_hex((255, 0, 255)): "fuchsia",
39
+ # rgb_to_hex((0, 255, 255)): "aqua",
40
+ # rgb_to_hex((255, 165, 0)): "orange",
41
+ # rgb_to_hex((128, 0, 128)): "purple",
42
+ # rgb_to_hex((255, 215, 0)): "gold",
43
+ # }
44
+
45
+ # Assuming rgb_to_hex is a function that converts an (R, G, B) tuple to a hex string.
46
+ # For example: def rgb_to_hex(rgb): return '#%02x%02x%02x' % rgb
47
+
48
# Maps a hex color string to a human-readable name. Insertion order defines
# the palette order used by DEFAULT_COLOR_PALETTE below.
DEFAULT_COLOR_HEX_TO_NAME = {
    # The top 20 approved colors
    rgb_to_hex((255, 255, 0)): "yellow",
    rgb_to_hex((0, 255, 0)): "lime",
    rgb_to_hex((0, 255, 255)): "cyan",
    rgb_to_hex((255, 0, 255)): "magenta",
    rgb_to_hex((255, 0, 0)): "red",
    rgb_to_hex((255, 127, 0)): "orange",
    rgb_to_hex((127, 255, 0)): "chartreuse",
    rgb_to_hex((0, 255, 127)): "spring green",
    rgb_to_hex((255, 0, 127)): "rose",
    rgb_to_hex((127, 0, 255)): "violet",
    rgb_to_hex((192, 255, 0)): "electric lime",
    rgb_to_hex((255, 192, 0)): "vivid orange",
    rgb_to_hex((0, 255, 192)): "turquoise",
    rgb_to_hex((192, 0, 255)): "bright violet",
    rgb_to_hex((255, 0, 192)): "bright pink",
    rgb_to_hex((255, 64, 0)): "fiery orange",
    rgb_to_hex((64, 255, 0)): "bright chartreuse",
    rgb_to_hex((0, 255, 64)): "malachite",
    rgb_to_hex((64, 0, 255)): "deep violet",
    rgb_to_hex((255, 0, 64)): "hot pink",
}


# Ordered list of the hex strings above (dict preserves insertion order).
DEFAULT_COLOR_PALETTE = list(DEFAULT_COLOR_HEX_TO_NAME.keys())
74
+
75
+
76
+ def _validate_color_hex(color_hex: str):
77
+ color_hex = color_hex.lstrip("#")
78
+ if not all(c in "0123456789abcdefABCDEF" for c in color_hex):
79
+ raise ValueError("Invalid characters in color hash")
80
+ if len(color_hex) not in (3, 6):
81
+ raise ValueError("Invalid length of color hash")
82
+
83
+
84
# copied from https://github.com/roboflow/supervision/blob/c8f557af0c61b5c03392bad2cc36c8835598b1e1/supervision/draw/color.py
@dataclass
class Color:
    """
    An RGB color with integer channels.

    Attributes:
        r (int): Red channel.
        g (int): Green channel.
        b (int): Blue channel.
    """

    r: int
    g: int
    b: int

    @classmethod
    def from_hex(cls, color_hex: str):
        """
        Create a Color instance from a hex string.

        Args:
            color_hex (str): Hex string of the color (e.g. '#ff00ff' or 'f0f').

        Returns:
            Color: Instance representing the color.

        Example:
            ```
            >>> Color.from_hex('#ff00ff')
            Color(r=255, g=0, b=255)
            ```
        """
        _validate_color_hex(color_hex)
        digits = color_hex.lstrip("#")
        # Expand shorthand "f0f" to "ff00ff" before parsing.
        if len(digits) == 3:
            digits = "".join(ch + ch for ch in digits)
        value = int(digits, 16)
        return cls((value >> 16) & 0xFF, (value >> 8) & 0xFF, value & 0xFF)

    @classmethod
    def to_hex(cls, color):
        """
        Convert a Color instance to a hex string.

        Args:
            color (Color): Color instance of color.

        Returns:
            str: hex string for the color.
        """
        return rgb_to_hex((color.r, color.g, color.b))

    def as_rgb(self) -> Tuple[int, int, int]:
        """
        Return the color as an (r, g, b) tuple.

        Example:
            ```
            >>> color.as_rgb()
            (255, 0, 255)
            ```
        """
        return self.r, self.g, self.b

    def as_bgr(self) -> Tuple[int, int, int]:
        """
        Return the color as a (b, g, r) tuple.

        Example:
            ```
            >>> color.as_bgr()
            (255, 0, 255)
            ```
        """
        return self.b, self.g, self.r

    @classmethod
    def white(cls):
        return Color.from_hex(color_hex="#ffffff")

    @classmethod
    def black(cls):
        return Color.from_hex(color_hex="#000000")

    @classmethod
    def red(cls):
        return Color.from_hex(color_hex="#ff0000")

    @classmethod
    def green(cls):
        return Color.from_hex(color_hex="#00ff00")

    @classmethod
    def blue(cls):
        return Color.from_hex(color_hex="#0000ff")
186
+
187
+
188
@dataclass
class ColorPalette:
    # Ordered list of Color entries; by_idx lookups wrap around this list.
    colors: List[Color]

    @classmethod
    def default(cls):
        """
        Returns a default color palette.

        Returns:
            ColorPalette: A ColorPalette instance with default colors.

        Example:
            ```
            >>> ColorPalette.default()
            ColorPalette(colors=[Color(r=255, g=0, b=0), Color(r=0, g=255, b=0), ...])
            ```
        """
        return ColorPalette.from_hex(color_hex_list=DEFAULT_COLOR_PALETTE)

    @classmethod
    def from_hex(cls, color_hex_list: List[str]):
        """
        Create a ColorPalette instance from a list of hex strings.

        Args:
            color_hex_list (List[str]): List of color hex strings.

        Returns:
            ColorPalette: A ColorPalette instance.

        Example:
            ```
            >>> ColorPalette.from_hex(['#ff0000', '#00ff00', '#0000ff'])
            ColorPalette(colors=[Color(r=255, g=0, b=0), Color(r=0, g=255, b=0), ...])
            ```
        """
        colors = [Color.from_hex(color_hex) for color_hex in color_hex_list]
        return cls(colors)

    def by_idx(self, idx: int) -> Color:
        """
        Return the color at a given index in the palette (wrapping modulo
        the palette length).

        Args:
            idx (int): Index of the color in the palette. Must be >= 0.

        Returns:
            Color: Color at the given index.

        Raises:
            ValueError: if idx is negative.

        Example:
            ```
            >>> color_palette.by_idx(1)
            Color(r=0, g=255, b=0)
            ```
        """
        if idx < 0:
            raise ValueError("idx argument should not be negative")
        idx = idx % len(self.colors)
        return self.colors[idx]

    def find_farthest_color(self, img_array):
        """
        Return the palette color that is farthest, on average (Euclidean
        distance in RGB space), from the pixels of the given image array.

        Args:
            img_array (np array): any *x3 np array, 3 is the RGB color channel.

        Returns:
            Tuple[Color, str]: the farthest color and its human-readable name
            ("unknown" if its hex is not in DEFAULT_COLOR_HEX_TO_NAME).
        """
        # Reshape the image array for broadcasting
        img_array = img_array.reshape((-1, 3))

        # Convert colors dictionary to a NumPy array
        color_values = np.array([[c.r, c.g, c.b] for c in self.colors])

        # Calculate the Euclidean distance between the colors and each pixel in the image
        # Broadcasting happens here: img_array shape is (num_pixels, 3), color_values shape is (num_colors, 3)
        distances = np.sqrt(
            np.sum((img_array[:, np.newaxis, :] - color_values) ** 2, axis=2)
        )

        # Average the distances for each color
        mean_distances = np.mean(distances, axis=0)

        # return the farthest color
        farthest_idx = np.argmax(mean_distances)
        farthest_color = self.colors[farthest_idx]
        farthest_color_hex = Color.to_hex(farthest_color)
        if farthest_color_hex in DEFAULT_COLOR_HEX_TO_NAME:
            farthest_color_name = DEFAULT_COLOR_HEX_TO_NAME[farthest_color_hex]
        else:
            farthest_color_name = "unknown"

        return farthest_color, farthest_color_name
285
+
286
+
287
def draw_box(ax, box_coord, alpha=0.8, edge_color="g", line_style="-", linewidth=2.0):
    """Draw an unfilled rectangle on a matplotlib axes.

    Args:
        ax: Matplotlib axes to draw on.
        box_coord: ``(x0, y0, width, height)`` of the box in data coordinates.
        alpha (float): Transparency of the outline.
        edge_color: Any matplotlib color spec for the outline.
        line_style (str): Matplotlib line style for the outline.
        linewidth (float): Outline width in points.
    """
    x0, y0, box_w, box_h = box_coord
    rect = mpl.patches.Rectangle(
        (x0, y0),
        box_w,
        box_h,
        fill=False,
        edgecolor=edge_color,
        linewidth=linewidth,
        alpha=alpha,
        linestyle=line_style,
    )
    ax.add_patch(rect)
301
+
302
+
303
def draw_text(
    ax,
    text,
    position,
    font_size=None,
    color="g",
    horizontal_alignment="left",
    rotation=0,
):
    """Draw a text label on a matplotlib axes.

    The requested color is normalized for legibility: every RGB channel is
    floored at 0.2 and the dominant channel is pushed to at least 0.8.

    Args:
        ax: Matplotlib axes to draw on.
        text (str): The label to render.
        position: ``(x, y)`` anchor in data coordinates (text hangs below it).
        font_size: Point size; defaults to ``mpl.rcParams["font.size"]``.
        color: Any matplotlib color spec.
        horizontal_alignment (str): Matplotlib horizontal alignment.
        rotation: Rotation of the text in degrees.
    """
    if not font_size:
        font_size = mpl.rcParams["font.size"]

    # Keep the label readable: lift dark channels, boost the dominant one.
    rgb = np.maximum(list(mplc.to_rgb(color)), 0.2)
    rgb[np.argmax(rgb)] = max(0.8, np.max(rgb))

    x, y = position
    ax.text(
        x,
        y,
        text,
        size=font_size,
        family="sans-serif",
        bbox={"facecolor": "none", "alpha": 0.5, "pad": 0.7, "edgecolor": "none"},
        verticalalignment="top",
        horizontalalignment=horizontal_alignment,
        color=rgb,
        rotation=rotation,
    )
331
+
332
+
333
def draw_mask(
    ax, rle, color, show_holes=True, alpha=0.15, upsample_factor=1.0, rle_upsampled=None
):
    """Draw a segmentation mask (translucent fill plus contour outline) on an axes.

    Args:
        ax: Matplotlib axes to draw on.
        rle: COCO RLE dict or a binary ndarray mask.
        color (np.ndarray): RGB color as a length-3 array in [0, 1].
        show_holes (bool): If True, render the fill from the (possibly
            upsampled) mask image so holes stay visible; contours are then
            drawn unfilled.
        alpha (float): Fill opacity.
        upsample_factor (float): Scale applied to contour coordinates; when
            > 1 together with ``show_holes``, ``rle_upsampled`` must be given.
        rle_upsampled: Upsampled counterpart of ``rle`` (RLE dict or ndarray).

    Raises:
        ValueError: If ``rle`` or ``rle_upsampled`` has an unsupported type.
    """
    if isinstance(rle, dict):
        mask = mask_utils.decode(rle)
    elif isinstance(rle, np.ndarray):
        mask = rle
    else:
        raise ValueError(f"Unsupported type for rle: {type(rle)}")

    mask_upsampled = None
    if upsample_factor > 1.0 and show_holes:
        assert rle_upsampled is not None
        if isinstance(rle_upsampled, dict):
            mask_upsampled = mask_utils.decode(rle_upsampled)
        elif isinstance(rle_upsampled, np.ndarray):
            mask_upsampled = rle_upsampled
        else:
            # Fixed: this message previously blamed `rle` even though the
            # failing value is `rle_upsampled`.
            raise ValueError(
                f"Unsupported type for rle_upsampled: {type(rle_upsampled)}"
            )

    if show_holes:
        if mask_upsampled is None:
            mask_upsampled = mask
        # Paint the fill as an RGBA image so interior holes stay transparent.
        h, w = mask_upsampled.shape
        mask_img = np.zeros((h, w, 4))
        mask_img[:, :, :-1] = color[np.newaxis, np.newaxis, :]
        mask_img[:, :, -1] = mask_upsampled * alpha
        ax.imshow(mask_img)

    # Star-unpacking keeps this compatible with both OpenCV 3 (3-tuple return)
    # and OpenCV 4 (2-tuple return) signatures of findContours.
    *_, contours, _ = cv2.findContours(
        mask.astype(np.uint8).copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    # Shift to pixel centers before scaling so contours align after upsampling.
    upsampled_contours = [(cont + 0.5) * upsample_factor - 0.5 for cont in contours]
    facecolor = (0, 0, 0, 0) if show_holes else color
    if alpha > 0.8:
        # A nearly opaque fill needs a darker edge to remain visible.
        edge_color = _change_color_brightness(color, brightness_factor=-0.7)
    else:
        edge_color = color
    for cont in upsampled_contours:
        polygon = mpl.patches.Polygon(
            [el[0] for el in cont],
            edgecolor=edge_color,
            linewidth=2.0,
            facecolor=facecolor,
        )
        ax.add_patch(polygon)
379
+
380
+
381
def _change_color_brightness(color, brightness_factor):
    """
    Shift a color's lightness up or down in HLS space.

    Args:
        color: color of the polygon. Refer to `matplotlib.colors` for a full
            list of formats that are accepted.
        brightness_factor (float): a value in [-1.0, 1.0] range. 0 means no
            change; a factor in [-1.0, 0) darkens the color and a factor in
            (0, 1.0] lightens it.

    Returns:
        modified_color (tuple[double]): RGB values of the modified color,
        each in the [0.0, 1.0] range.
    """
    assert -1.0 <= brightness_factor <= 1.0
    hue, lightness, saturation = colorsys.rgb_to_hls(*mplc.to_rgb(color))
    lightness = lightness + brightness_factor * lightness
    lightness = min(1.0, max(0.0, lightness))
    return colorsys.hls_to_rgb(hue, lightness, saturation)
sam3/agent/helpers/visualizer.py ADDED
@@ -0,0 +1,1662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import colorsys
4
+ import logging
5
+ import math
6
+ import random
7
+ from enum import Enum, unique
8
+
9
+ import cv2
10
+ import matplotlib as mpl
11
+ import matplotlib.colors as mplc
12
+ import matplotlib.figure as mplfigure
13
+ import numpy as np
14
+ import pycocotools.mask as mask_util
15
+ import torch
16
+ from iopath.common.file_io import PathManager
17
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
18
+ from PIL import Image
19
+
20
+ from .boxes import Boxes, BoxMode
21
+
22
+ from .color_map import random_color
23
+ from .keypoints import Keypoints
24
+ from .masks import BitMasks, PolygonMasks
25
+ from .rotated_boxes import RotatedBoxes
26
+
27
# Module-level logger for visualization diagnostics.
logger = logging.getLogger(__name__)


__all__ = ["ColorMode", "VisImage", "Visualizer"]


# Area thresholds in pixels^2 used by drawing heuristics elsewhere in this
# module (e.g. label placement / mask labeling) — see their usage below.
_SMALL_OBJECT_AREA_THRESH = 1000
_LARGE_MASK_AREA_THRESH = 120000
# Common colors as RGB fractions in [0, 1].
_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
_BLACK = (0, 0, 0)
_RED = (1.0, 0, 0)

# Default minimum confidence for drawing keypoints; copied onto
# Visualizer.keypoint_threshold in Visualizer.__init__.
_KEYPOINT_THRESHOLD = 0.05
40
+
41
+
42
@unique
class ColorMode(Enum):
    """
    Enum of different color modes to use for instance visualizations.
    """

    # Picks a random color for every instance and overlays segmentations
    # with low opacity.
    IMAGE = 0

    # Instances of the same category get similar colors (from
    # metadata.thing_colors), overlaid with high opacity. This puts more
    # attention on the quality of segmentation.
    SEGMENTATION = 1

    # Same as IMAGE, but converts all areas without masks to gray-scale.
    # Only available for drawing per-instance mask predictions.
    IMAGE_BW = 2
63
+
64
+
65
class GenericMask:
    """
    Lazily-converted segmentation mask.

    Accepts a COCO-style RLE dict, a list of polygons, or a binary ndarray,
    and converts between representations on demand.

    Attribute:
        polygons (list[ndarray]): polygons for this mask, each a flat
            [x, y, x, y, ...] array.
        mask (ndarray): a binary mask of shape (height, width).
    """

    def __init__(self, mask_or_polygons, height, width):
        self._mask = None
        self._polygons = None
        self._has_holes = None
        self.height = height
        self.width = width

        m = mask_or_polygons
        if isinstance(m, dict):
            # COCO RLE, compressed or uncompressed.
            assert "counts" in m and "size" in m
            if isinstance(m["counts"], list):  # uncompressed RLE
                h, w = m["size"]
                assert h == height and w == width
                m = mask_util.frPyObjects(m, h, w)
            self._mask = mask_util.decode(m)[:, :]
        elif isinstance(m, list):  # list of polygons
            self._polygons = [np.asarray(p).reshape(-1) for p in m]
        elif isinstance(m, np.ndarray):  # assumed to be a binary mask
            assert m.shape[1] != 2, m.shape
            assert m.shape == (
                height,
                width,
            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
            self._mask = m.astype("uint8")
        else:
            raise ValueError(
                "GenericMask cannot handle object {} of type '{}'".format(m, type(m))
            )

    @property
    def mask(self):
        # Rasterize polygons lazily on first access.
        if self._mask is None:
            self._mask = self.polygons_to_mask(self._polygons)
        return self._mask

    @property
    def polygons(self):
        # Trace contours lazily on first access.
        if self._polygons is None:
            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
        return self._polygons

    @property
    def has_holes(self):
        if self._has_holes is None:
            if self._mask is None:
                # Polygon input cannot encode holes.
                self._has_holes = False
            else:
                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
        return self._has_holes

    def mask_to_polygons(self, mask):
        """Trace a binary mask into polygons; also report whether it has holes."""
        # cv2.RETR_CCOMP builds a 2-level hierarchy: external boundaries in
        # level 1, holes in level 2. CHAIN_APPROX_NONE keeps every vertex.
        # Some cv2 versions cannot handle non-contiguous arrays.
        mask = np.ascontiguousarray(mask)
        res = cv2.findContours(
            mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE
        )
        hierarchy = res[-1]
        if hierarchy is None:  # empty mask
            return [], False
        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
        contours = [c.flatten() for c in res[-2]]
        # OpenCV vertices are integers in [0, W-1 or H-1]; add 0.5 to move them
        # into real-valued coordinate space. A better solution would be to
        # first +0.5 and then dilate the returned polygon by 0.5.
        # Degenerate polygons with fewer than 3 points are dropped.
        return [c + 0.5 for c in contours if len(c) >= 6], has_holes

    def polygons_to_mask(self, polygons):
        """Rasterize polygons into a binary mask of this object's size."""
        rle = mask_util.merge(mask_util.frPyObjects(polygons, self.height, self.width))
        return mask_util.decode(rle)[:, :]

    def area(self):
        """Return the number of foreground pixels."""
        return self.mask.sum()

    def bbox(self):
        """Return the tight [x0, y0, x1, y1] bounding box of the polygons."""
        p = mask_util.merge(mask_util.frPyObjects(self.polygons, self.height, self.width))
        bbox = mask_util.toBbox(p)
        # toBbox returns XYWH; convert to XYXY in place.
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]
        return bbox
167
+
168
+
169
+ class _PanopticPrediction:
170
+ """
171
+ Unify different panoptic annotation/prediction formats
172
+ """
173
+
174
+ def __init__(self, panoptic_seg, segments_info, metadata=None):
175
+ if segments_info is None:
176
+ assert metadata is not None
177
+ # If "segments_info" is None, we assume "panoptic_img" is a
178
+ # H*W int32 image storing the panoptic_id in the format of
179
+ # category_id * label_divisor + instance_id. We reserve -1 for
180
+ # VOID label.
181
+ label_divisor = metadata.label_divisor
182
+ segments_info = []
183
+ for panoptic_label in np.unique(panoptic_seg.numpy()):
184
+ if panoptic_label == -1:
185
+ # VOID region.
186
+ continue
187
+ pred_class = panoptic_label // label_divisor
188
+ isthing = (
189
+ pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
190
+ )
191
+ segments_info.append(
192
+ {
193
+ "id": int(panoptic_label),
194
+ "category_id": int(pred_class),
195
+ "isthing": bool(isthing),
196
+ }
197
+ )
198
+ del metadata
199
+
200
+ self._seg = panoptic_seg
201
+
202
+ self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info
203
+ segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
204
+ areas = areas.numpy()
205
+ sorted_idxs = np.argsort(-areas)
206
+ self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
207
+ self._seg_ids = self._seg_ids.tolist()
208
+ for sid, area in zip(self._seg_ids, self._seg_areas):
209
+ if sid in self._sinfo:
210
+ self._sinfo[sid]["area"] = float(area)
211
+
212
+ def non_empty_mask(self):
213
+ """
214
+ Returns:
215
+ (H, W) array, a mask for all pixels that have a prediction
216
+ """
217
+ empty_ids = []
218
+ for id in self._seg_ids:
219
+ if id not in self._sinfo:
220
+ empty_ids.append(id)
221
+ if len(empty_ids) == 0:
222
+ return np.zeros(self._seg.shape, dtype=np.uint8)
223
+ assert (
224
+ len(empty_ids) == 1
225
+ ), ">1 ids corresponds to no labels. This is currently not supported"
226
+ return (self._seg != empty_ids[0]).numpy().astype(np.bool)
227
+
228
+ def semantic_masks(self):
229
+ for sid in self._seg_ids:
230
+ sinfo = self._sinfo.get(sid)
231
+ if sinfo is None or sinfo["isthing"]:
232
+ # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
233
+ continue
234
+ yield (self._seg == sid).numpy().astype(np.bool), sinfo
235
+
236
+ def instance_masks(self):
237
+ for sid in self._seg_ids:
238
+ sinfo = self._sinfo.get(sid)
239
+ if sinfo is None or not sinfo["isthing"]:
240
+ continue
241
+ mask = (self._seg == sid).numpy().astype(np.bool)
242
+ if mask.sum() > 0:
243
+ yield mask, sinfo
244
+
245
+
246
+ def _create_text_labels(classes, scores, class_names, is_crowd=None):
247
+ """
248
+ Args:
249
+ classes (list[int] or None):
250
+ scores (list[float] or None):
251
+ class_names (list[str] or None):
252
+ is_crowd (list[bool] or None):
253
+
254
+ Returns:
255
+ list[str] or None
256
+ """
257
+ labels = None
258
+ if classes is not None:
259
+ if class_names is not None and len(class_names) > 0:
260
+ labels = [class_names[i] for i in classes]
261
+ else:
262
+ labels = [str(i) for i in classes]
263
+ if scores is not None:
264
+ if labels is None:
265
+ labels = ["{:.0f}%".format(s * 100) for s in scores]
266
+ else:
267
+ labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
268
+ if labels is not None and is_crowd is not None:
269
+ labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
270
+ return labels
271
+
272
+
273
class VisImage:
    """Matplotlib (Agg) canvas wrapping a single RGB image for annotation."""

    def __init__(self, img, scale=1.0):
        """
        Args:
            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
            scale (float): scale the input image
        """
        self.img = img
        self.scale = scale
        self.width = img.shape[1]
        self.height = img.shape[0]
        self._setup_figure(img)

    def _setup_figure(self, img):
        """
        Build a frameless figure + full-bleed axes sized to the image.

        Args:
            Same as in :meth:`__init__()`.
        """
        fig = mplfigure.Figure(frameon=False)
        self.dpi = fig.get_dpi()
        # Add a small 1e-2 to avoid precision loss due to matplotlib's
        # truncation (https://github.com/matplotlib/matplotlib/issues/15363).
        fig.set_size_inches(
            (self.width * self.scale + 1e-2) / self.dpi,
            (self.height * self.scale + 1e-2) / self.dpi,
        )
        self.canvas = FigureCanvasAgg(fig)
        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
        ax.axis("off")
        self.fig = fig
        self.ax = ax
        self.reset_image(img)

    def reset_image(self, img):
        """
        Replace the displayed base image.

        Args:
            img: same as in __init__
        """
        self.ax.imshow(
            img.astype("uint8"),
            extent=(0, self.width, self.height, 0),
            interpolation="nearest",
        )

    def save(self, filepath):
        """
        Args:
            filepath (str): absolute path, including the file name, where the
                visualized image will be saved.
        """
        self.fig.savefig(filepath)

    def get_image(self):
        """
        Returns:
            ndarray:
                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
                The shape is scaled w.r.t the input image using the given
                `scale` argument.
        """
        s, (width, height) = self.canvas.print_to_buffer()
        buffer = np.frombuffer(s, dtype="uint8")
        img_rgba = buffer.reshape(height, width, 4)
        # Drop the alpha channel; Agg renders RGBA.
        rgb, _alpha = np.split(img_rgba, [3], axis=2)
        return rgb.astype("uint8")
347
+
348
+
349
+ class Visualizer:
350
+ """
351
+ Visualizer that draws data about detection/segmentation on images.
352
+
353
+ It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
354
+ that draw primitive objects to images, as well as high-level wrappers like
355
+ `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
356
+ that draw composite data in some pre-defined style.
357
+
358
+ Note that the exact visualization style for the high-level wrappers are subject to change.
359
+ Style such as color, opacity, label contents, visibility of labels, or even the visibility
360
+ of objects themselves (e.g. when the object is too small) may change according
361
+ to different heuristics, as long as the results still look visually reasonable.
362
+
363
+ To obtain a consistent style, you can implement custom drawing functions with the
364
+ abovementioned primitive methods instead. If you need more customized visualization
365
+ styles, you can process the data yourself following their format documented in
366
+ tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
367
+ intend to satisfy everyone's preference on drawing styles.
368
+
369
+ This visualizer focuses on high rendering quality rather than performance. It is not
370
+ designed to be used for real-time applications.
371
+ """
372
+
373
+ def __init__(
374
+ self,
375
+ img_rgb,
376
+ metadata=None,
377
+ scale=1.0,
378
+ instance_mode=ColorMode.IMAGE,
379
+ font_size_multiplier=1.3,
380
+ boarder_width_multiplier=1.5,
381
+ ):
382
+ """
383
+ Args:
384
+ img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
385
+ the height and width of the image respectively. C is the number of
386
+ color channels. The image is required to be in RGB format since that
387
+ is a requirement of the Matplotlib library. The image is also expected
388
+ to be in the range [0, 255].
389
+ metadata (Metadata): dataset metadata (e.g. class names and colors)
390
+ instance_mode (ColorMode): defines one of the pre-defined style for drawing
391
+ instances on an image.
392
+ """
393
+ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
394
+ self.boarder_width_multiplier = boarder_width_multiplier
395
+ # if metadata is None:
396
+ # metadata = MetadataCatalog.get("__nonexist__")
397
+ # self.metadata = metadata
398
+ self.output = VisImage(self.img, scale=scale)
399
+ self.cpu_device = torch.device("cpu")
400
+
401
+ # too small texts are useless, therefore clamp to 9
402
+ self._default_font_size = (
403
+ max(np.sqrt(self.output.height * self.output.width) // 60, 15 // scale)
404
+ * font_size_multiplier
405
+ )
406
+ # self._default_font_size = 18
407
+ self._instance_mode = instance_mode
408
+ self.keypoint_threshold = _KEYPOINT_THRESHOLD
409
+
410
+ import matplotlib.colors as mcolors
411
+
412
+ css4_colors = mcolors.CSS4_COLORS
413
+ self.color_proposals = [
414
+ list(mcolors.hex2color(color)) for color in css4_colors.values()
415
+ ]
416
+
417
+ def draw_instance_predictions(self, predictions):
418
+ """
419
+ Draw instance-level prediction results on an image.
420
+
421
+ Args:
422
+ predictions (Instances): the output of an instance detection/segmentation
423
+ model. Following fields will be used to draw:
424
+ "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
425
+
426
+ Returns:
427
+ output (VisImage): image object with visualizations.
428
+ """
429
+ boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
430
+ scores = predictions.scores if predictions.has("scores") else None
431
+ classes = (
432
+ predictions.pred_classes.tolist()
433
+ if predictions.has("pred_classes")
434
+ else None
435
+ )
436
+ labels = _create_text_labels(
437
+ classes, scores, self.metadata.get("thing_classes", None)
438
+ )
439
+ keypoints = (
440
+ predictions.pred_keypoints if predictions.has("pred_keypoints") else None
441
+ )
442
+
443
+ keep = (scores > 0.5).cpu()
444
+ boxes = boxes[keep]
445
+ scores = scores[keep]
446
+ classes = np.array(classes)
447
+ classes = classes[np.array(keep)]
448
+ labels = np.array(labels)
449
+ labels = labels[np.array(keep)]
450
+
451
+ if predictions.has("pred_masks"):
452
+ masks = np.asarray(predictions.pred_masks)
453
+ masks = masks[np.array(keep)]
454
+ masks = [
455
+ GenericMask(x, self.output.height, self.output.width) for x in masks
456
+ ]
457
+ else:
458
+ masks = None
459
+
460
+ if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get(
461
+ "thing_colors"
462
+ ):
463
+ # if self.metadata.get("thing_colors"):
464
+ colors = [
465
+ self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
466
+ for c in classes
467
+ ]
468
+ alpha = 0.4
469
+ else:
470
+ colors = None
471
+ alpha = 0.4
472
+
473
+ if self._instance_mode == ColorMode.IMAGE_BW:
474
+ self.output.reset_image(
475
+ self._create_grayscale_image(
476
+ (predictions.pred_masks.any(dim=0) > 0).numpy()
477
+ if predictions.has("pred_masks")
478
+ else None
479
+ )
480
+ )
481
+ alpha = 0.3
482
+
483
+ self.overlay_instances(
484
+ masks=masks,
485
+ boxes=boxes,
486
+ labels=labels,
487
+ keypoints=keypoints,
488
+ assigned_colors=colors,
489
+ alpha=alpha,
490
+ )
491
+ return self.output
492
+
493
+ def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.7):
494
+ """
495
+ Draw semantic segmentation predictions/labels.
496
+
497
+ Args:
498
+ sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
499
+ Each value is the integer label of the pixel.
500
+ area_threshold (int): segments with less than `area_threshold` are not drawn.
501
+ alpha (float): the larger it is, the more opaque the segmentations are.
502
+
503
+ Returns:
504
+ output (VisImage): image object with visualizations.
505
+ """
506
+ if isinstance(sem_seg, torch.Tensor):
507
+ sem_seg = sem_seg.numpy()
508
+ labels, areas = np.unique(sem_seg, return_counts=True)
509
+ sorted_idxs = np.argsort(-areas).tolist()
510
+ labels = labels[sorted_idxs]
511
+ for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
512
+ try:
513
+ mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
514
+ except (AttributeError, IndexError):
515
+ mask_color = None
516
+
517
+ binary_mask = (sem_seg == label).astype(np.uint8)
518
+ text = self.metadata.stuff_classes[label]
519
+ self.draw_binary_mask(
520
+ binary_mask,
521
+ color=mask_color,
522
+ edge_color=_OFF_WHITE,
523
+ text=text,
524
+ alpha=alpha,
525
+ area_threshold=area_threshold,
526
+ )
527
+ return self.output
528
+
529
    def draw_panoptic_seg(
        self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7
    ):
        """
        Draw panoptic prediction annotations or results.

        Args:
            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
                segment.
            segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
                If it is a ``list[dict]``, each dict contains keys "id", "category_id".
                If None, category id of each pixel is computed by
                ``pixel // metadata.label_divisor``.
            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
            alpha (float): opacity of the drawn masks.

        Returns:
            output (VisImage): image object with visualizations.
        """
        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)

        # In black-and-white mode, gray out every pixel with no prediction.
        if self._instance_mode == ColorMode.IMAGE_BW:
            self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))

        # draw mask for all semantic segments first i.e. "stuff"
        for mask, sinfo in pred.semantic_masks():
            category_idx = sinfo["category_id"]
            try:
                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
            except AttributeError:
                # Metadata without stuff_colors: let draw_binary_mask choose.
                mask_color = None

            # Strip dataset-specific suffixes from class names for display.
            text = (
                self.metadata.stuff_classes[category_idx]
                .replace("-other", "")
                .replace("-merged", "")
            )
            self.draw_binary_mask(
                mask,
                color=mask_color,
                edge_color=_OFF_WHITE,
                text=text,
                alpha=alpha,
                area_threshold=area_threshold,
            )

        # draw mask for all instances second
        all_instances = list(pred.instance_masks())
        if len(all_instances) == 0:
            return self.output
        masks, sinfo = list(zip(*all_instances))
        category_ids = [x["category_id"] for x in sinfo]

        try:
            scores = [x["score"] for x in sinfo]
        except KeyError:
            # Ground-truth segments carry no "score" key.
            scores = None
        class_names = [
            name.replace("-other", "").replace("-merged", "")
            for name in self.metadata.thing_classes
        ]
        labels = _create_text_labels(
            category_ids, scores, class_names, [x.get("iscrowd", 0) for x in sinfo]
        )

        try:
            colors = [
                self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
                for c in category_ids
            ]
        except AttributeError:
            colors = None
        self.overlay_instances(
            masks=masks, labels=labels, assigned_colors=colors, alpha=alpha
        )

        return self.output

    # Old public name kept so existing callers continue to work.
    draw_panoptic_seg_predictions = draw_panoptic_seg  # backward compatibility
607
+
608
    def draw_dataset_dict(self, dic):
        """
        Draw annotations/segmentaions in Detectron2 Dataset format.

        Args:
            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.

        Returns:
            output (VisImage): image object with visualizations.
        """
        annos = dic.get("annotations", None)
        if annos:
            # Assumes all annotations carry the same optional fields as the
            # first one (standard for Detectron2 dataset dicts).
            if "segmentation" in annos[0]:
                masks = [x["segmentation"] for x in annos]
            else:
                masks = None
            if "keypoints" in annos[0]:
                keypts = [x["keypoints"] for x in annos]
                # Reshape to (num_instances, num_keypoints, 3): (x, y, visibility).
                keypts = np.array(keypts).reshape(len(annos), -1, 3)
            else:
                keypts = None

            # 4-element boxes are converted to absolute XYXY; 5-element
            # (rotated) boxes are passed through unchanged.
            boxes = [
                (
                    BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
                    if len(x["bbox"]) == 4
                    else x["bbox"]
                )
                for x in annos
            ]

            colors = None
            category_ids = [x["category_id"] for x in annos]
            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get(
                "thing_colors"
            ):
                # Jitter per-category colors so same-class instances differ.
                colors = [
                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
                    for c in category_ids
                ]
            names = self.metadata.get("thing_classes", None)
            labels = _create_text_labels(
                category_ids,
                scores=None,
                class_names=names,
                is_crowd=[x.get("iscrowd", 0) for x in annos],
            )
            self.overlay_instances(
                labels=labels,
                boxes=boxes,
                masks=masks,
                keypoints=keypts,
                assigned_colors=colors,
            )

        # Semantic segmentation: inline array, or lazily loaded from file.
        # NOTE(review): PathManager here is the iopath *class* (see module
        # imports), not an instance; `PathManager.open(path, "rb")` likely
        # needs a `PathManager()` instance — verify before relying on the
        # file-loading branches below.
        sem_seg = dic.get("sem_seg", None)
        if sem_seg is None and "sem_seg_file_name" in dic:
            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
                sem_seg = Image.open(f)
                sem_seg = np.asarray(sem_seg, dtype="uint8")
        if sem_seg is not None:
            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.4)

        # Panoptic segmentation: inline array, or decoded from an RGB PNG
        # via panopticapi's rgb2id.
        pan_seg = dic.get("pan_seg", None)
        if pan_seg is None and "pan_seg_file_name" in dic:
            with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
                pan_seg = Image.open(f)
                pan_seg = np.asarray(pan_seg)
                from panopticapi.utils import rgb2id

                pan_seg = rgb2id(pan_seg)
        if pan_seg is not None:
            segments_info = dic["segments_info"]
            pan_seg = torch.tensor(pan_seg)
            self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.7)
        return self.output
684
+
685
    def overlay_instances(
        self,
        *,
        boxes=None,
        labels=None,
        masks=None,
        keypoints=None,
        assigned_colors=None,
        binary_masks=None,
        alpha=0.5,
        label_mode="1",
    ):
        """
        Draw boxes/masks/keypoints for all instances and place a numbered
        (or lettered) mark on each one.

        Args:
            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
                or a :class:`RotatedBoxes`,
                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
                for the N objects in a single image,
            labels (list[str]): the text to be displayed for each instance.
            masks (masks-like object): Supported types are:

                * :class:`detectron2.structures.PolygonMasks`,
                  :class:`detectron2.structures.BitMasks`.
                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
                  The first level of the list corresponds to individual instances. The second
                  level to all the polygon that compose the instance, and the third level
                  to the polygon coordinates. The third level should have the format of
                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
                * list[dict]: each dict is a COCO-style RLE.
            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
                where the N is the number of instances and K is the number of keypoints.
                The last dimension corresponds to (x, y, visibility or score).
            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
                for full list of formats that the colors are accepted in.
            binary_masks (list[ndarray], optional): per-instance (H, W) binary masks;
                when given, marks are placed inside each mask and the mask is shaded.
            alpha (float): blending coefficient for the binary-mask overlays.
            label_mode (str): "1" for numeric marks, "a" for alphabetic marks.

        Returns:
            tuple: ``(labels, marks, marks_position)`` — the input labels, the list
            of mark strings drawn, and the text positions chosen for the marks
            (``marks_position`` is only filled when ``binary_masks`` is given).
        """
        # Normalize inputs and infer the instance count from whichever of
        # boxes / masks / keypoints is provided; all provided inputs must agree.
        num_instances = 0
        if boxes is not None:
            boxes = self._convert_boxes(boxes)
            num_instances = len(boxes)
        if masks is not None:
            masks = self._convert_masks(masks)
            if num_instances:
                assert len(masks) == num_instances
            else:
                num_instances = len(masks)
        if keypoints is not None:
            if num_instances:
                assert len(keypoints) == num_instances
            else:
                num_instances = len(keypoints)
            keypoints = self._convert_keypoints(keypoints)
        if labels is not None:
            assert len(labels) == num_instances
        if assigned_colors is None:
            assigned_colors = [
                random_color(rgb=True, maximum=1) for _ in range(num_instances)
            ]
        if num_instances == 0:
            return labels, [], []
        # Nx5 boxes are rotated boxes and use a dedicated drawing path.
        if boxes is not None and boxes.shape[1] == 5:
            return self.overlay_rotated_instances(
                boxes=boxes, labels=labels, assigned_colors=assigned_colors
            )

        # Display in largest to smallest order to reduce occlusion.
        # NOTE(review): `areas` is computed but the re-ordering below is
        # commented out, so instances are currently drawn in input order.
        areas = None
        if boxes is not None:
            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
        elif masks is not None:
            areas = np.asarray([x.area() for x in masks])

        # if areas is not None:
        #     # sorted_idxs = np.argsort(areas).tolist()
        #     sorted_idxs = np.argsort(-areas).tolist()
        #     # Re-order overlapped instances in descending order.
        #     boxes = boxes[sorted_idxs] if boxes is not None else None
        #     labels = [labels[k] for k in sorted_idxs] if labels is not None else None
        #     masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
        #     binary_masks = (
        #         [binary_masks[idx] for idx in sorted_idxs]
        #         if binary_masks is not None
        #         else None
        #     )
        #     assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
        #     keypoints = keypoints[sorted_idxs] if keypoints is not None else None

        marks = []
        marks_position = []
        added_positions = set()  # label spots already taken, to avoid overlaps
        for i in range(num_instances):
            color = assigned_colors[i]
            if boxes is not None:
                self.draw_box(boxes[i], alpha=1, edge_color=color)
                if binary_masks is None:
                    # draw number for non-mask instances
                    mark = self._draw_number_in_box(
                        boxes[i], i + 1, color=color, label_mode=label_mode
                    )
                    marks.append(mark)

            if binary_masks is not None:
                # Put the mark at the most interior point of the mask, then
                # shade the mask itself.
                mark, mask_position = self._draw_number_in_mask(
                    binary_mask=binary_masks[i].astype("uint8"),
                    text=i + 1,
                    color=color,
                    added_positions=added_positions,
                    label_mode=label_mode,
                )
                marks.append(mark)
                marks_position.append(mask_position)

                self.draw_binary_mask(
                    binary_masks[i],
                    color=color,
                    edge_color=_OFF_WHITE,
                    alpha=alpha,
                )

            if masks is not None:
                for segment in masks[i].polygons:
                    self.draw_polygon(
                        segment.reshape(-1, 2), color, alpha=0
                    )  # alpha=0 so holes in masks are not colored

        # draw keypoints
        if keypoints is not None:
            for keypoints_per_instance in keypoints:
                self.draw_and_connect_keypoints(keypoints_per_instance)

        # return labels, marks, sorted_idxs, marks_position
        return labels, marks, marks_position
821
+
822
+ def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
823
+ """
824
+ Args:
825
+ boxes (ndarray): an Nx5 numpy array of
826
+ (x_center, y_center, width, height, angle_degrees) format
827
+ for the N objects in a single image.
828
+ labels (list[str]): the text to be displayed for each instance.
829
+ assigned_colors (list[matplotlib.colors]): a list of colors, where each color
830
+ corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
831
+ for full list of formats that the colors are accepted in.
832
+
833
+ Returns:
834
+ output (VisImage): image object with visualizations.
835
+ """
836
+ num_instances = len(boxes)
837
+
838
+ if assigned_colors is None:
839
+ assigned_colors = [
840
+ random_color(rgb=True, maximum=1) for _ in range(num_instances)
841
+ ]
842
+ if num_instances == 0:
843
+ return self.output
844
+
845
+ # Display in largest to smallest order to reduce occlusion.
846
+ if boxes is not None:
847
+ areas = boxes[:, 2] * boxes[:, 3]
848
+
849
+ sorted_idxs = np.argsort(-areas).tolist()
850
+ # Re-order overlapped instances in descending order.
851
+ boxes = boxes[sorted_idxs]
852
+ labels = [labels[k] for k in sorted_idxs] if labels is not None else None
853
+ colors = [assigned_colors[idx] for idx in sorted_idxs]
854
+
855
+ for i in range(num_instances):
856
+ self.draw_rotated_box_with_label(
857
+ boxes[i],
858
+ edge_color=colors[i],
859
+ label=labels[i] if labels is not None else None,
860
+ )
861
+
862
+ return self.output
863
+
864
    def draw_and_connect_keypoints(self, keypoints):
        """
        Draws keypoints of an instance and follows the rules for keypoint connections
        to draw lines between appropriate keypoints. This follows color heuristics for
        line color.

        Args:
            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
                and the last dimension corresponds to (x, y, probability).

        Returns:
            output (VisImage): image object with visualizations.
        """
        visible = {}  # keypoint name -> (x, y) for points above the threshold
        keypoint_names = self.metadata.get("keypoint_names")
        for idx, keypoint in enumerate(keypoints):
            # draw keypoint
            x, y, prob = keypoint
            if prob > self.keypoint_threshold:
                self.draw_circle((x, y), color=_RED)
                if keypoint_names:
                    keypoint_name = keypoint_names[idx]
                    visible[keypoint_name] = (x, y)

        # Connect visible keypoint pairs using the dataset's connection rules.
        if self.metadata.get("keypoint_connection_rules"):
            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
                if kp0 in visible and kp1 in visible:
                    x0, y0 = visible[kp0]
                    x1, y1 = visible[kp1]
                    # Rule colors are 0-255; matplotlib wants 0-1.
                    color = tuple(x / 255.0 for x in color)
                    self.draw_line([x0, x1], [y0, y1], color=color)

        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
        # Note that this strategy is specific to person keypoints.
        # For other keypoints, it should just do nothing
        try:
            ls_x, ls_y = visible["left_shoulder"]
            rs_x, rs_y = visible["right_shoulder"]
            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
        except KeyError:
            # Shoulders not visible: skip the torso lines entirely.
            pass
        else:
            # draw line from nose to mid-shoulder
            nose_x, nose_y = visible.get("nose", (None, None))
            if nose_x is not None:
                self.draw_line(
                    [nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED
                )

            try:
                # draw line from mid-shoulder to mid-hip
                lh_x, lh_y = visible["left_hip"]
                rh_x, rh_y = visible["right_hip"]
            except KeyError:
                pass
            else:
                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
                self.draw_line(
                    [mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED
                )
        return self.output
925
+
926
+ def mask_dims_from_binary(self, binary_mask):
927
+ ind_y, ind_x = np.where(binary_mask == 1)
928
+ min_ind_x = np.min(ind_x)
929
+ max_ind_x = np.max(ind_x)
930
+ min_ind_y = np.min(ind_y)
931
+ max_ind_y = np.max(ind_y)
932
+ return (max_ind_x - min_ind_x), (max_ind_y - min_ind_y)
933
+
934
    def reposition_label(self, position, cur, binary_mask, move_count):
        """
        Decide whether a label at ``position`` should be nudged, and by how much.

        Two rules fire a move: (1) on the very first attempt, labels on small
        objects are pushed aside so they do not cover the object; (2) a label
        too close (Manhattan distance) to an already-placed label is pushed
        away from it, with moves clamped near the image edges.

        Args:
            position (tuple): candidate (x, y) label position.
            cur (iterable[tuple]): positions already occupied by other labels.
            binary_mask (ndarray): mask of the instance being labelled.
            move_count (int): number of moves already applied to this label.

        Returns:
            tuple: (should_move, move_x, move_y).
        """
        img_width, img_height = self.output.width, self.output.height
        mask_width, mask_height = self.mask_dims_from_binary(binary_mask)

        # set reposition thresholds
        mask_width_limit, mask_height_limit = (
            25,
            25,
        )  # objects smaller than this would be covered by their label
        location_diff_threshold = 15  # minimum allowed distance between two labels
        x_boundry_limit, y_boundry_limit = (
            20,
            20,
        )  # keep labels at least this far from the image edges
        offset_x = 15  # move in x direction
        offset_y = 15  # move in y direction

        x1, y1 = position

        # Rule 1: first attempt on a small object — always move the label off
        # the mask, flipping the direction near the far image border.
        if (
            mask_width < mask_width_limit
            and mask_height < mask_height_limit
            and move_count == 0
        ):
            move_x = offset_x if offset_x + x1 < img_width else -offset_x
            move_y = offset_y if offset_y + y1 < img_height else -offset_y
            return (True, move_x, move_y)

        # Rule 2: move away from the first already-placed label that is too
        # close; a move that would land in the border margin is zeroed out.
        for x2, y2 in cur:
            if abs(x1 - x2) + abs(y1 - y2) < location_diff_threshold:
                move_x = offset_x if x1 >= x2 else -offset_x
                move_y = offset_y if y1 >= y2 else -offset_y
                move_x = (
                    0
                    if x1 + move_x > img_width - x_boundry_limit
                    or x1 + move_x < x_boundry_limit
                    else move_x
                )
                move_y = (
                    0
                    if y1 + move_y > img_height - y_boundry_limit
                    or y1 + move_y < y_boundry_limit
                    else move_y
                )
                return (
                    True,
                    move_x,
                    move_y,
                )
        return (False, 0, 0)
985
+
986
+ def locate_label_position(self, original_position, added_positions, binary_mask):
987
+ if added_positions is None or binary_mask is None:
988
+ return original_position
989
+
990
+ x, y = original_position
991
+
992
+ move_count = 0
993
+ reposition, x_move, y_move = self.reposition_label(
994
+ (x, y), added_positions, binary_mask, move_count
995
+ )
996
+ while reposition and move_count < 10:
997
+ x += x_move
998
+ y += y_move
999
+ move_count += 1
1000
+ reposition, x_move, y_move = self.reposition_label(
1001
+ (x, y), added_positions, binary_mask, move_count
1002
+ )
1003
+ added_positions.add((x, y))
1004
+ return x, y
1005
+
1006
+ """
1007
+ Primitive drawing functions:
1008
+ """
1009
+
1010
    def draw_text(
        self,
        text,
        position,
        added_positions=None,
        binary_mask=None,
        *,
        font_size=None,
        color="g",
        horizontal_alignment="center",
        rotation=0,
    ):
        """
        Args:
            text (str): class label
            position (tuple): a tuple of the x and y coordinates to place text on image.
            added_positions (set, optional): positions of labels already drawn;
                when given together with ``binary_mask``, the text is nudged to
                avoid overlaps (see ``locate_label_position``).
            binary_mask (ndarray, optional): mask of the labelled instance,
                used by the collision-avoidance logic above.
            font_size (int, optional): font of the text. If not provided, a font size
                proportional to the image width is calculated and used.
            color: color of the text. Refer to `matplotlib.colors` for full list
                of formats that are accepted.
            horizontal_alignment (str): see `matplotlib.text.Text`
            rotation: rotation angle in degrees CCW

        Returns:
            output (VisImage): image object with text drawn.
        """
        if not font_size:
            font_size = self._default_font_size

        # since the text background is dark, we don't want the text to be dark
        color = np.maximum(list(mplc.to_rgb(color)), 0.15)
        color[np.argmax(color)] = max(0.8, np.max(color))

        def contrasting_color(rgb):
            """Returns 'white' or 'black' depending on which color contrasts more with the given RGB value."""

            # Decompose the RGB tuple
            R, G, B = rgb

            # Calculate the luma (ITU-R BT.601 weights)
            Y = 0.299 * R + 0.587 * G + 0.114 * B

            # If Y value is greater than 128, it's closer to white so return black. Otherwise, return white.
            return "black" if Y > 128 else "white"

        # Background box color that contrasts with the (0-255 scaled) text color.
        bbox_background = contrasting_color(color * 255)

        # Possibly shift the text away from already-placed labels.
        x, y = self.locate_label_position(
            original_position=position,
            added_positions=added_positions,
            binary_mask=binary_mask,
        )

        self.output.ax.text(
            x,
            y,
            text,
            size=font_size * self.output.scale,
            family="sans-serif",
            bbox={
                "facecolor": bbox_background,
                "alpha": 0.8,
                "pad": 0.7,
                "edgecolor": "none",
            },
            verticalalignment="top",
            horizontalalignment=horizontal_alignment,
            color=color,
            zorder=10,  # keep labels above masks/boxes
            rotation=rotation,
        )
        return self.output
1082
+
1083
+ def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
1084
+ """
1085
+ Args:
1086
+ box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
1087
+ are the coordinates of the image's top left corner. x1 and y1 are the
1088
+ coordinates of the image's bottom right corner.
1089
+ alpha (float): blending efficient. Smaller values lead to more transparent masks.
1090
+ edge_color: color of the outline of the box. Refer to `matplotlib.colors`
1091
+ for full list of formats that are accepted.
1092
+ line_style (string): the string to use to create the outline of the boxes.
1093
+
1094
+ Returns:
1095
+ output (VisImage): image object with box drawn.
1096
+ """
1097
+ x0, y0, x1, y1 = box_coord
1098
+ width = x1 - x0
1099
+ height = y1 - y0
1100
+
1101
+ linewidth = max(self._default_font_size / 12, 1) * self.boarder_width_multiplier
1102
+
1103
+ self.output.ax.add_patch(
1104
+ mpl.patches.Rectangle(
1105
+ (x0, y0),
1106
+ width,
1107
+ height,
1108
+ fill=False,
1109
+ edgecolor=edge_color,
1110
+ linewidth=linewidth * self.output.scale,
1111
+ alpha=alpha,
1112
+ linestyle=line_style,
1113
+ )
1114
+ )
1115
+ return self.output
1116
+
1117
    def draw_rotated_box_with_label(
        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
    ):
        """
        Draw a rotated box with label on its top-left corner.

        Args:
            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
                where cnt_x and cnt_y are the center coordinates of the box.
                w and h are the width and height of the box. angle represents how
                many degrees the box is rotated CCW with regard to the 0-degree box.
            alpha (float): blending coefficient. NOTE(review): currently unused
                in this method.
            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
                for full list of formats that are accepted.
            line_style (string): the string to use to create the outline of the boxes.
            label (string): label for rotated box. It will not be rendered when set to None.

        Returns:
            output (VisImage): image object with box drawn.
        """
        cnt_x, cnt_y, w, h, angle = rotated_box
        area = w * h
        # use thinner lines when the box is small
        linewidth = self._default_font_size / (
            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
        )

        theta = angle * math.pi / 180.0
        c = math.cos(theta)
        s = math.sin(theta)
        # Corner offsets of the unrotated box, centered at the origin.
        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
        # x: left->right ; y: top->down
        # Rotate each corner by `angle` and translate to the box center.
        rotated_rect = [
            (s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect
        ]
        for k in range(4):
            j = (k + 1) % 4
            # One edge is dashed so the box orientation remains visible.
            self.draw_line(
                [rotated_rect[k][0], rotated_rect[j][0]],
                [rotated_rect[k][1], rotated_rect[j][1]],
                color=edge_color,
                linestyle="--" if k == 1 else line_style,
                linewidth=linewidth,
            )

        if label is not None:
            text_pos = rotated_rect[1]  # topleft corner

            # Scale the font with the box height relative to the image size.
            height_ratio = h / np.sqrt(self.output.height * self.output.width)
            label_color = self._change_color_brightness(
                edge_color, brightness_factor=0.7
            )
            font_size = (
                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
                * 0.5
                * self._default_font_size
            )
            self.draw_text(
                label, text_pos, color=label_color, font_size=font_size, rotation=angle
            )

        return self.output
1179
+
1180
+ def draw_circle(self, circle_coord, color, radius=3):
1181
+ """
1182
+ Args:
1183
+ circle_coord (list(int) or tuple(int)): contains the x and y coordinates
1184
+ of the center of the circle.
1185
+ color: color of the polygon. Refer to `matplotlib.colors` for a full list of
1186
+ formats that are accepted.
1187
+ radius (int): radius of the circle.
1188
+
1189
+ Returns:
1190
+ output (VisImage): image object with box drawn.
1191
+ """
1192
+ x, y = circle_coord
1193
+ self.output.ax.add_patch(
1194
+ mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
1195
+ )
1196
+ return self.output
1197
+
1198
+ def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
1199
+ """
1200
+ Args:
1201
+ x_data (list[int]): a list containing x values of all the points being drawn.
1202
+ Length of list should match the length of y_data.
1203
+ y_data (list[int]): a list containing y values of all the points being drawn.
1204
+ Length of list should match the length of x_data.
1205
+ color: color of the line. Refer to `matplotlib.colors` for a full list of
1206
+ formats that are accepted.
1207
+ linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
1208
+ for a full list of formats that are accepted.
1209
+ linewidth (float or None): width of the line. When it's None,
1210
+ a default value will be computed and used.
1211
+
1212
+ Returns:
1213
+ output (VisImage): image object with line drawn.
1214
+ """
1215
+ if linewidth is None:
1216
+ linewidth = self._default_font_size / 3
1217
+ linewidth = max(linewidth, 1)
1218
+ self.output.ax.add_line(
1219
+ mpl.lines.Line2D(
1220
+ x_data,
1221
+ y_data,
1222
+ linewidth=linewidth * self.output.scale,
1223
+ color=color,
1224
+ linestyle=linestyle,
1225
+ )
1226
+ )
1227
+ return self.output
1228
+
1229
    def draw_binary_mask(
        self,
        binary_mask,
        color=None,
        *,
        edge_color=None,
        text=None,
        alpha=0.7,
        area_threshold=10,
    ):
        """
        Args:
            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
                W is the image width. Each value in the array is either a 0 or 1 value of uint8
                type.
            color: color of the mask. Refer to `matplotlib.colors` for a full list of
                formats that are accepted. If None, will pick a random color.
            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
                full list of formats that are accepted.
            text (str): if not None, will be drawn on the object.
            alpha (float): blending coefficient. Smaller values lead to more transparent masks.
            area_threshold (float): a connected component smaller than this area will not be shown.

        Returns:
            output (VisImage): image object with mask drawn.
        """
        if color is None:
            color = random_color(rgb=True, maximum=1)
        color = mplc.to_rgb(color)

        has_valid_segment = False
        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
        mask = GenericMask(binary_mask, self.output.height, self.output.width)
        shape2d = (binary_mask.shape[0], binary_mask.shape[1])

        if not mask.has_holes:
            # draw polygons for regular masks
            for segment in mask.polygons:
                # Skip connected components below the pixel-area threshold.
                area = mask_util.area(
                    mask_util.frPyObjects([segment], shape2d[0], shape2d[1])
                )
                if area < (area_threshold or 0):
                    continue
                has_valid_segment = True
                segment = segment.reshape(-1, 2)
                self.draw_polygon(
                    segment, color=color, edge_color=edge_color, alpha=alpha
                )
        else:
            # Masks with holes cannot be drawn as simple filled polygons;
            # paint an RGBA overlay instead.
            # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
            rgba = np.zeros(shape2d + (4,), dtype="float32")
            rgba[:, :, :3] = color
            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
            has_valid_segment = True
            self.output.ax.imshow(
                rgba, extent=(0, self.output.width, self.output.height, 0)
            )

        if text is not None and has_valid_segment:
            # Label with a lighter shade of the mask color for readability.
            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
            self._draw_text_in_mask(binary_mask, text, lighter_color)
        return self.output
1291
+
1292
+ def draw_binary_mask_with_number(
1293
+ self,
1294
+ binary_mask,
1295
+ color=None,
1296
+ *,
1297
+ edge_color=None,
1298
+ text=None,
1299
+ label_mode="1",
1300
+ alpha=0.1,
1301
+ anno_mode=["Mask"],
1302
+ area_threshold=10,
1303
+ ):
1304
+ """
1305
+ Args:
1306
+ binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
1307
+ W is the image width. Each value in the array is either a 0 or 1 value of uint8
1308
+ type.
1309
+ color: color of the mask. Refer to `matplotlib.colors` for a full list of
1310
+ formats that are accepted. If None, will pick a random color.
1311
+ edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
1312
+ full list of formats that are accepted.
1313
+ text (str): if None, will be drawn on the object
1314
+ alpha (float): blending efficient. Smaller values lead to more transparent masks.
1315
+ area_threshold (float): a connected component smaller than this area will not be shown.
1316
+
1317
+ Returns:
1318
+ output (VisImage): image object with mask drawn.
1319
+ """
1320
+ if color is None:
1321
+ randint = random.randint(0, len(self.color_proposals) - 1)
1322
+ color = self.color_proposals[randint]
1323
+ color = mplc.to_rgb(color)
1324
+
1325
+ has_valid_segment = True
1326
+ binary_mask = binary_mask.astype("uint8") # opencv needs uint8
1327
+ mask = GenericMask(binary_mask, self.output.height, self.output.width)
1328
+ shape2d = (binary_mask.shape[0], binary_mask.shape[1])
1329
+ bbox = mask.bbox()
1330
+
1331
+ if "Mask" in anno_mode:
1332
+ if not mask.has_holes:
1333
+ # draw polygons for regular masks
1334
+ for segment in mask.polygons:
1335
+ area = mask_util.area(
1336
+ mask_util.frPyObjects([segment], shape2d[0], shape2d[1])
1337
+ )
1338
+ if area < (area_threshold or 0):
1339
+ continue
1340
+ has_valid_segment = True
1341
+ segment = segment.reshape(-1, 2)
1342
+ self.draw_polygon(
1343
+ segment, color=color, edge_color=edge_color, alpha=alpha
1344
+ )
1345
+ else:
1346
+ # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
1347
+ rgba = np.zeros(shape2d + (4,), dtype="float32")
1348
+ rgba[:, :, :3] = color
1349
+ rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
1350
+ has_valid_segment = True
1351
+ self.output.ax.imshow(
1352
+ rgba, extent=(0, self.output.width, self.output.height, 0)
1353
+ )
1354
+
1355
+ if "Box" in anno_mode:
1356
+ self.draw_box(bbox, edge_color=color, alpha=0.75)
1357
+
1358
+ if "Mark" in anno_mode:
1359
+ has_valid_segment = True
1360
+ else:
1361
+ has_valid_segment = False
1362
+
1363
+ if text is not None and has_valid_segment:
1364
+ # lighter_color = tuple([x*0.2 for x in color])
1365
+ lighter_color = [
1366
+ 1,
1367
+ 1,
1368
+ 1,
1369
+ ] # self._change_color_brightness(color, brightness_factor=0.7)
1370
+ self._draw_number_in_mask(
1371
+ binary_mask=binary_mask,
1372
+ text=text,
1373
+ color=lighter_color,
1374
+ label_mode=label_mode,
1375
+ )
1376
+ return self.output
1377
+
1378
+ def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5):
1379
+ """
1380
+ Args:
1381
+ soft_mask (ndarray): float array of shape (H, W), each value in [0, 1].
1382
+ color: color of the mask. Refer to `matplotlib.colors` for a full list of
1383
+ formats that are accepted. If None, will pick a random color.
1384
+ text (str): if None, will be drawn on the object
1385
+ alpha (float): blending efficient. Smaller values lead to more transparent masks.
1386
+
1387
+ Returns:
1388
+ output (VisImage): image object with mask drawn.
1389
+ """
1390
+ if color is None:
1391
+ color = random_color(rgb=True, maximum=1)
1392
+ color = mplc.to_rgb(color)
1393
+
1394
+ shape2d = (soft_mask.shape[0], soft_mask.shape[1])
1395
+ rgba = np.zeros(shape2d + (4,), dtype="float32")
1396
+ rgba[:, :, :3] = color
1397
+ rgba[:, :, 3] = soft_mask * alpha
1398
+ self.output.ax.imshow(
1399
+ rgba, extent=(0, self.output.width, self.output.height, 0)
1400
+ )
1401
+
1402
+ if text is not None:
1403
+ lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
1404
+ binary_mask = (soft_mask > 0.5).astype("uint8")
1405
+ self._draw_text_in_mask(binary_mask, text, lighter_color)
1406
+ return self.output
1407
+
1408
+ def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
1409
+ """
1410
+ Args:
1411
+ segment: numpy array of shape Nx2, containing all the points in the polygon.
1412
+ color: color of the polygon. Refer to `matplotlib.colors` for a full list of
1413
+ formats that are accepted.
1414
+ edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
1415
+ full list of formats that are accepted. If not provided, a darker shade
1416
+ of the polygon color will be used instead.
1417
+ alpha (float): blending efficient. Smaller values lead to more transparent masks.
1418
+
1419
+ Returns:
1420
+ output (VisImage): image object with polygon drawn.
1421
+ """
1422
+ if edge_color is None:
1423
+ # make edge color darker than the polygon color
1424
+ if alpha > 0.8:
1425
+ edge_color = self._change_color_brightness(
1426
+ color, brightness_factor=-0.7
1427
+ )
1428
+ else:
1429
+ edge_color = color
1430
+ edge_color = mplc.to_rgb(edge_color) + (1,)
1431
+
1432
+ polygon = mpl.patches.Polygon(
1433
+ segment,
1434
+ fill=True,
1435
+ facecolor=mplc.to_rgb(color) + (alpha,),
1436
+ edgecolor=edge_color,
1437
+ linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
1438
+ )
1439
+ self.output.ax.add_patch(polygon)
1440
+ return self.output
1441
+
1442
+ """
1443
+ Internal methods:
1444
+ """
1445
+
1446
+ def _jitter(self, color):
1447
+ """
1448
+ Randomly modifies given color to produce a slightly different color than the color given.
1449
+
1450
+ Args:
1451
+ color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
1452
+ picked. The values in the list are in the [0.0, 1.0] range.
1453
+
1454
+ Returns:
1455
+ jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
1456
+ color after being jittered. The values in the list are in the [0.0, 1.0] range.
1457
+ """
1458
+ color = mplc.to_rgb(color)
1459
+ # np.random.seed(0)
1460
+ vec = np.random.rand(3)
1461
+ # better to do it in another color space
1462
+ vec = vec / np.linalg.norm(vec) * 0.5
1463
+ res = np.clip(vec + color, 0, 1)
1464
+ return tuple(res)
1465
+
1466
+ def _create_grayscale_image(self, mask=None):
1467
+ """
1468
+ Create a grayscale version of the original image.
1469
+ The colors in masked area, if given, will be kept.
1470
+ """
1471
+ img_bw = self.img.astype("f4").mean(axis=2)
1472
+ img_bw = np.stack([img_bw] * 3, axis=2)
1473
+ if mask is not None:
1474
+ img_bw[mask] = self.img[mask]
1475
+ return img_bw
1476
+
1477
+ def _change_color_brightness(self, color, brightness_factor):
1478
+ """
1479
+ Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
1480
+ less or more saturation than the original color.
1481
+
1482
+ Args:
1483
+ color: color of the polygon. Refer to `matplotlib.colors` for a full list of
1484
+ formats that are accepted.
1485
+ brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
1486
+ 0 will correspond to no change, a factor in [-1.0, 0) range will result in
1487
+ a darker color and a factor in (0, 1.0] range will result in a lighter color.
1488
+
1489
+ Returns:
1490
+ modified_color (tuple[double]): a tuple containing the RGB values of the
1491
+ modified color. Each value in the tuple is in the [0.0, 1.0] range.
1492
+ """
1493
+ assert brightness_factor >= -1.0 and brightness_factor <= 1.0
1494
+ color = mplc.to_rgb(color)
1495
+ polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
1496
+ modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
1497
+ modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
1498
+ modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
1499
+ modified_color = colorsys.hls_to_rgb(
1500
+ polygon_color[0], modified_lightness, polygon_color[2]
1501
+ )
1502
+ return modified_color
1503
+
1504
+ def _convert_boxes(self, boxes):
1505
+ """
1506
+ Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
1507
+ """
1508
+ if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
1509
+ return boxes.tensor.detach().numpy()
1510
+ else:
1511
+ return np.asarray(boxes)
1512
+
1513
+ def _convert_masks(self, masks_or_polygons):
1514
+ """
1515
+ Convert different format of masks or polygons to a tuple of masks and polygons.
1516
+
1517
+ Returns:
1518
+ list[GenericMask]:
1519
+ """
1520
+
1521
+ m = masks_or_polygons
1522
+ if isinstance(m, PolygonMasks):
1523
+ m = m.polygons
1524
+ if isinstance(m, BitMasks):
1525
+ m = m.tensor.numpy()
1526
+ if isinstance(m, torch.Tensor):
1527
+ m = m.numpy()
1528
+ ret = []
1529
+ for x in m:
1530
+ if isinstance(x, GenericMask):
1531
+ ret.append(x)
1532
+ else:
1533
+ ret.append(GenericMask(x, self.output.height, self.output.width))
1534
+ return ret
1535
+
1536
    def _draw_number_in_box(self, box, text, color, label_mode="1"):
        """
        Find a proper place to draw the mark ``text`` given a box, and draw it.

        Args:
            box (array-like): (x0, y0, x1, y1) box.
            text (int or str): the mark; converted to letters when
                ``label_mode == "a"``.
            color: base color; a lighter shade is used for the text itself.
            label_mode (str): "1" for numeric marks, "a" for alphabetic.

        Returns:
            str: the mark string that was drawn.
        """
        x0, y0, x1, y1 = box
        text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
        horiz_align = "left"
        # for small objects, draw text at the side to avoid occlusion
        instance_area = (y1 - y0) * (x1 - x0)
        if (
            instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
            or y1 - y0 < 40 * self.output.scale
        ):
            # Near the bottom edge, fall back to the top-right corner.
            if y1 >= self.output.height - 5:
                text_pos = (x1, y0)
            else:
                text_pos = (x0, y1)

        # Font size scales with the box height relative to the image size.
        height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
        lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
        font_size = (
            np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
            * 0.65
            * self._default_font_size
        )
        if label_mode == "a":
            text = self.number_to_string(int(text))
        else:
            text = text  # no-op branch: keep the mark exactly as passed in
        self.draw_text(
            text,
            text_pos,
            color=lighter_color,
            horizontal_alignment=horiz_align,
            font_size=font_size,
        )

        return str(text)
1574
+
1575
+ @staticmethod
1576
+ def number_to_string(n):
1577
+ chars = []
1578
+ while n:
1579
+ n, remainder = divmod(n - 1, 26)
1580
+ chars.append(chr(97 + remainder))
1581
+ return "".join(reversed(chars))
1582
+
1583
    def _draw_number_in_mask(
        self, binary_mask, text, color, added_positions=None, label_mode="1"
    ):
        """
        Draw the mark ``text`` at the most interior point of a binary mask.

        The interior point is the maximum of the distance transform, i.e. the
        pixel farthest from the mask boundary.

        Args:
            binary_mask (ndarray): (H, W) uint8 mask of the instance.
            text (int or str): the mark; converted to letters when
                ``label_mode == "a"``.
            color: text color.
            added_positions (set, optional): positions of marks drawn so far,
                forwarded to ``draw_text`` for overlap avoidance.
            label_mode (str): "1" for numeric marks, "a" for alphabetic.

        Returns:
            tuple: (mark string, (x, y) text position).
        """
        # Pad by one pixel so masks touching the image border still get a
        # finite distance to the "boundary", then crop the padding back off.
        binary_mask = np.pad(binary_mask, ((1, 1), (1, 1)), "constant")
        mask_dt = cv2.distanceTransform(binary_mask, cv2.DIST_L2, 0)
        mask_dt = mask_dt[1:-1, 1:-1]
        max_dist = np.max(mask_dt)
        coords_y, coords_x = np.where(mask_dt == max_dist)  # coords is [y, x]

        if label_mode == "a":
            text = self.number_to_string(int(text))
        else:
            text = text  # no-op branch: keep the mark exactly as passed in

        # Take the middle candidate among the farthest points, with a small
        # fixed offset so the text sits nicely inside the mask.
        text_position = (
            coords_x[len(coords_x) // 2] + 2,
            coords_y[len(coords_y) // 2] - 6,
        )
        # NOTE(review): `binary_mask` is the padded (1px larger) array here, so
        # draw_text's collision logic sees a mask shifted by one pixel.
        self.draw_text(
            text,
            text_position,
            added_positions=added_positions,
            binary_mask=binary_mask,
            color=color,
        )

        return str(text), text_position

        # Dead code below: an earlier connected-component based placement.
        # _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
        # if stats[1:, -1].size == 0:
        #     return
        # largest_component_id = np.argmax(stats[1:, -1]) + 1

        # # draw text on the largest component, as well as other very large components.
        # for cid in range(1, _num_cc):
        #     if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
        #         # median is more stable than centroid
        #         # center = centroids[largest_component_id]
        #         center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
        #         # bottom=np.max((cc_labels == cid).nonzero(), axis=1)[::-1]
        #         # center[1]=bottom[1]+2
        #         self.draw_text(text, center, color=color)
1628
+
1629
    def _draw_text_in_mask(self, binary_mask, text, color):
        """
        Draw ``text`` on the largest connected component of a binary mask, and
        on any other component whose area exceeds ``_LARGE_MASK_AREA_THRESH``.
        """
        _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(
            binary_mask, 8
        )
        if stats[1:, -1].size == 0:
            # No foreground component: nothing to label.
            return
        largest_component_id = np.argmax(stats[1:, -1]) + 1

        # draw text on the largest component, as well as other very large components.
        for cid in range(1, _num_cc):
            if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
                # median is more stable than centroid
                # center = centroids[largest_component_id]
                center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
                # Anchor the text just below the component's bottom edge.
                bottom = np.max((cc_labels == cid).nonzero(), axis=1)[::-1]
                center[1] = bottom[1] + 2
                self.draw_text(text, center, color=color)
1649
+
1650
+ def _convert_keypoints(self, keypoints):
1651
+ if isinstance(keypoints, Keypoints):
1652
+ keypoints = keypoints.tensor
1653
+ keypoints = np.asarray(keypoints)
1654
+ return keypoints
1655
+
1656
    def get_output(self):
        """
        Returns:
            output (VisImage): the image output containing the visualizations added
                to the image.
        """
        return self.output
sam3/agent/helpers/zoom_in.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import io
4
+ import math
5
+
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import pycocotools.mask as mask_utils
9
+ from PIL import Image
10
+
11
+ from .som_utils import ColorPalette, draw_box, draw_mask, draw_text
12
+
13
+
14
def render_zoom_in(
    object_data,
    image_file,
    show_box: bool = True,
    show_text: bool = False,
    show_holes: bool = True,
    mask_alpha: float = 0.15,
):
    """
    Render a two-panel visualization with a cropped original view (left/upper) and a zoomed-in
    mask overlay (right/lower), then return it as a PIL.Image along with the chosen mask color (hex).

    Parameters
    ----------
    object_data : dict
        Dict containing "labels" and COCO RLE "segmentation".
        Expected:
            object_data["labels"][0]["noun_phrase"] : str
            object_data["segmentation"] : COCO RLE (with "size": [H, W])
    image_file : PIL.Image.Image
        Source image (PIL).
    show_box : bool
        Whether to draw the bbox on the cropped original panel.
    show_text : bool
        Whether to draw the noun phrase label near the bbox.
    show_holes : bool
        Whether to render mask holes (passed through to draw_mask).
    mask_alpha : float
        Alpha for the mask overlay.

    Returns
    -------
    pil_img : PIL.Image.Image
        The composed visualization image.
    color_hex : str
        Hex string of the chosen mask color.
    """

    # Relative-area thresholds controlling how much context each panel keeps.
    area_thresh_zoom = 0.25
    area_thresh_crop = 0.05

    def _clamped_shift(x, w, w_new, w_img):
        # Center the enlarged window on the box, clamped to image bounds.
        assert 0 <= w_new <= w_img
        shift = (w_new - w) / 2
        if x - shift + w_new > w_img:
            shift = x + w_new - w_img
        return min(x, shift)

    def _expanded_window(box_xywh, base_w, base_h, rel_area, threshold, img_w, img_h):
        # Grow the padded base window so the mask occupies at most `threshold`
        # of it, then reposition the window inside the image.
        w_new, h_new = base_w, base_h
        if rel_area > threshold:
            ratio = math.sqrt(rel_area / threshold)
            w_new = min(base_w * ratio, img_w)
            h_new = min(base_h * ratio, img_h)
        dx = _clamped_shift(box_xywh[0], box_xywh[2], w_new, img_w)
        dy = _clamped_shift(box_xywh[1], box_xywh[3], h_new, img_h)
        return [box_xywh[0] - dx, box_xywh[1] - dy, w_new, h_new]

    # ---- main body ----
    object_label = object_data["labels"][0]["noun_phrase"]
    img = image_file.convert("RGB")
    bbox_xywh = mask_utils.toBbox(object_data["segmentation"])  # [x, y, w, h]
    bx, by, bw, bh = bbox_xywh[0], bbox_xywh[1], bbox_xywh[2], bbox_xywh[3]

    # Pick a mask color visually distant from the object's own crop.
    crop_img = img.crop([bx, by, bx + bw, by + bh])
    palette = ColorPalette.default()
    color_obj, _ = palette.find_farthest_color(np.array(crop_img))
    color = np.array([color_obj.r, color_obj.g, color_obj.b]) / 255
    color_hex = f"#{color_obj.r:02x}{color_obj.g:02x}{color_obj.b:02x}"

    # Compute the zoom-in window (panel 2) and the context crop (panel 1).
    img_h, img_w = object_data["segmentation"]["size"]
    mask_area = mask_utils.area(object_data["segmentation"])
    base_w = min(bw + max(0.2 * bw, 16), img_w)
    base_h = min(bh + max(0.2 * bh, 16), img_h)
    rel_area = mask_area / (base_w * base_h)
    zoom_in_box = _expanded_window(
        bbox_xywh, base_w, base_h, rel_area, area_thresh_zoom, img_w, img_h
    )
    img_crop_box = _expanded_window(
        bbox_xywh, base_w, base_h, rel_area, area_thresh_crop, img_w, img_h
    )

    # Stack side-by-side for tall crops, vertically for wide ones.
    if img_crop_box[2] < img_crop_box[3]:
        fig, (ax1, ax2) = plt.subplots(1, 2)
    else:
        fig, (ax1, ax2) = plt.subplots(2, 1)

    # Panel 1: cropped original with optional box/text.
    panel1 = img.crop(
        [
            img_crop_box[0],
            img_crop_box[1],
            img_crop_box[0] + img_crop_box[2],
            img_crop_box[1] + img_crop_box[3],
        ]
    )
    rel_box = [bx - img_crop_box[0], by - img_crop_box[1], bw, bh]
    ax1.imshow(panel1)
    ax1.axis("off")
    if show_box:
        draw_box(ax1, rel_box, edge_color=color)
    if show_text:
        draw_text(ax1, object_label, [rel_box[0] + 2, rel_box[1] + 2], color=color)

    # Panel 2: zoomed-in mask overlay (mask carried through the alpha channel
    # so it survives the crop without re-decoding).
    binary_mask = mask_utils.decode(object_data["segmentation"])
    img_rgba = img.convert("RGBA")
    img_rgba.putalpha(Image.fromarray((binary_mask * 255).astype("uint8")))
    zoom_crop = img_rgba.crop(
        [
            zoom_in_box[0],
            zoom_in_box[1],
            zoom_in_box[0] + zoom_in_box[2],
            zoom_in_box[1] + zoom_in_box[3],
        ]
    )
    mask_zoomed = np.array(zoom_crop.split()[3]).astype(bool)

    ax2.imshow(zoom_crop.convert("RGB"))
    ax2.axis("off")
    draw_mask(ax2, mask_zoomed, color=color, show_holes=show_holes, alpha=mask_alpha)

    plt.tight_layout()

    # Rasterize the figure into an in-memory PNG and hand it back as PIL.
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0, dpi=100)
    plt.close(fig)
    buf.seek(0)
    pil_img = Image.open(buf)

    return pil_img, color_hex
sam3/agent/inference.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import json
4
+ import os
5
+
6
+ from sam3.agent.agent_core import agent_inference
7
+
8
+
9
def run_single_image_inference(
    image_path,
    text_prompt,
    llm_config,
    send_generate_request,
    call_sam_service,
    output_dir="agent_output",
    debug=False,
):
    """Run SAM 3 agent inference on a single image with the provided prompt.

    Args:
        image_path: Path to the input image file.
        text_prompt: Text prompt / referring expression to ground.
        llm_config: Dict with at least a "name" key identifying the LLM.
        send_generate_request: Callable the agent uses to query the LLM.
        call_sam_service: Callable the agent uses to query the SAM service.
        output_dir: Directory where prediction JSON/PNG and history are written.
        debug: Whether to enable the agent's debug mode.

    Returns:
        Path of the rendered prediction image, or None when the output JSON
        already exists and the run is skipped.

    Raises:
        FileNotFoundError: If image_path does not exist.
    """
    llm_name = llm_config["name"]

    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Build output file names from the image basename and a filesystem-safe prompt.
    image_basename = os.path.splitext(os.path.basename(image_path))[0]
    prompt_for_filename = text_prompt.replace("/", "_").replace(" ", "_")

    base_filename = f"{image_basename}_{prompt_for_filename}_agent_{llm_name}"
    output_json_path = os.path.join(output_dir, f"{base_filename}_pred.json")
    output_image_path = os.path.join(output_dir, f"{base_filename}_pred.png")
    agent_history_path = os.path.join(output_dir, f"{base_filename}_history.json")

    # Check if output already exists and skip
    if os.path.exists(output_json_path):
        print(f"Output JSON {output_json_path} already exists. Skipping.")
        return

    print(f"{'-'*30} Starting SAM 3 Agent Session... {'-'*30} ")
    agent_history, final_output_dict, rendered_final_output = agent_inference(
        image_path,
        text_prompt,
        send_generate_request=send_generate_request,
        call_sam_service=call_sam_service,
        output_dir=output_dir,
        debug=debug,
    )
    print(f"{'-'*30} End of SAM 3 Agent Session... {'-'*30} ")

    final_output_dict["text_prompt"] = text_prompt
    final_output_dict["image_path"] = image_path

    # Save outputs. Use context managers so the file handles are closed
    # deterministically (the original `json.dump(..., open(path, "w"))`
    # pattern leaked the handles until GC).
    with open(output_json_path, "w") as f:
        json.dump(final_output_dict, f, indent=4)
    with open(agent_history_path, "w") as f:
        json.dump(agent_history, f, indent=4)
    rendered_final_output.save(output_image_path)

    print("\n✅ Successfully processed single image!")
    print(f"Output JSON: {output_json_path}")
    print(f"Output Image: {output_image_path}")
    print(f"Agent History: {agent_history_path}")
    return output_image_path
sam3/agent/system_prompts/system_prompt.txt ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a helpful visual-concept grounding assistant capable of leveraging tool calls to ground concepts the user refers to, and providing structured JSON outputs and tool calls.
2
+ The user may provide you with a referring expression that matches some part(s) of the image, or a question whose answer points to some part(s) of the image.
3
+ You should observe and analyze the image along with the initial user input query very carefully, note all details in the image, think about what the user is actually referring to, how to leverage existing tools below to ground the target(s), and then call exactly one tool per turn.
4
+ At each turn, all available mask(s) will be renumbered and re-rendered on the most recent image provided to you. The numbering and coloring can be different from previous turns. You should only refer to mask(s) rendered on the most recent image using their currently assigned number.
5
+ If a tool call does not produce the intended output, do not give up; be creative and try calling the segment_phrase tool again with different parameters, or try a different tool. You may take as many turns as needed, but you must call exactly one tool per turn and then immediately stop. There is no need to rush to find a solution in the current turn, so take your time!
6
+
7
+
8
+ How you should understand the initial user input query and the raw input image:
9
+
10
+ 1. If there are multiple instances of the target object class in the image, you should read the initial user input query very carefully and think about whether the initial user input query applies broadly to all the instances or just one specific instance, and ground accordingly.
11
+ 2. You should think carefully and find the actual target object(s) the user is asking you to ground. Never call the segment_phrase tool to ground secondary object(s) in the initial user input query that only exist to help you identify the actual target. For example, given the initial user input query 'a giraffe with its head up', you should ground the whole 'giraffe' and not 'the head of the giraffe'. Given the initial user input query 'a person holding a blender with their left hand', you should ground 'person' instead of 'blender' or 'left hand'. Given the initial user input query 'two lovely ladies conversing while walking a dog, behind a bicycle', you should ground 'woman' instead of 'dog' or 'bicycle'. Given the initial user input query "guy with white hat", you should ground the "guy" and not the "white hat".
12
+ 3. Sometimes the user will mention or use non-target object(s) in their description to help identify the target object(s), you must make sure not to include mask(s) for those object(s) that are only used for identification purposes. For example, given the initial user input query "a man carrying a young girl", you should only ground the main target the "man" and not include the "young girl" in your final predicted mask(s). Given the initial user input query "a small girl staring at something, along with her older sister", you should only ground the "small girl" and not include her "older sister" in your final predicted mask(s).
13
+ 4. Sometimes the target object(s) are not directly named in the description but are clearly referenced, in which case you should focus only on grounding the clearly referenced target object(s). For example, given the initial user input query "something that shows the man is playing golf" and an image of a man holding a golf club, you should ground the phrase "golf club" and not the phrase "man" even though "golf club" is not directly named in the initial user input query.
14
+ 5. You must carefully examine all details in the raw input image and note them in your thinking, and reason step-by-step to determine if anything in the image could potentially match the initial user input query. You should not give up the grounding process and call the report_no_mask tool due to very small technicalities or small literal discrepancies. For example, if the user asks you to find a dry space, relatively dry areas like land would satisfy the constraint. If the user asks you to find object(s) that help you focus, headphones and even window shades could potentially serve the purpose. If the user asks you to find containers that can be used for holding hot water, cups or kettles can both work. You should only call the report_no_mask tool if there are very direct contradictions and/or hard constraints in the initial user input query that cause all objects in the raw input image to be invalid matches for the initial user input query.
15
+ 6. Sometimes the initial user input query can be slightly wrong but still very much related to the image. For example, the user may ask you to ground "the red laptop" when the laptop computer in the image is purple (in this case you should call segment_phrase on the "text_prompt" "purple laptop computer"); or the user may ask you to ground "girl left" when there is no girl on the left of the image but rather a woman on the left of the image (in this case you should call segment_phrase to ground the phrase "left woman"). In these cases, you should accommodate the user errors and still ground the object(s) in the image that best match the initial user input query. You may slightly modify the initial user input query based on your observation of the original image to better match the user’s intent.
16
+ 7. Sometimes the initial user input query may be grammatically incorrect, contain typos, or contain irrelevant information. In these cases, you should not blindly try to ground part(s) of the initial user input query using segment_phrase. Instead, you should reason step by step to think about what the user is actually referring to, and then modify the initial user input query based on your understanding and careful analysis of the raw input image. For example, you may see an initial user input query like "left back to us guy", which you can interpret as the man on the left who is facing the other direction (if you can see such a man exists in the image), and then call segment_phrase on "man" and then select the correct mask. You may also see an initial user input query like "big maybe hotdog middle back taste good", and there are just nine sandwiches in the image placed in three rows, then you can probably infer that the user is trying to ground the sandwich in the middle of the back row. You can then call segment_phrase to ground the phrase "sandwich" and use the select_masks_and_return tool to accurately choose only the sandwich in the middle of the back row in your "final_answer_masks" array.
17
+ 8. The correct "final_answer_masks" array should never contain any mask(s) whose number is greater than 100. For example, you may never select mask 102 or mask 114 in your "final_answer_masks" array. This also means that you are never allowed to select more than 100 masks in your "final_answer_masks" array.
18
+ 9. Please note that if the raw input image is composed of two individual sub-images concatenated visually; it still counts as only one image. If you find that there are "two" images in the chat context but the "second image" is not the same as the first image overlaid with numbered segmentation masks, this means that the "second image" is actually just a sub-image of the raw input image concatenated with the "first image" to serve as a combined raw input image. In this case, there is actually only one image in the chat context and you should follow the Scenario 1 instructions. This is very important!
19
+
20
+ You should always follow the response format defined below and complete the Steps for Each Turn as specified below. Never break the specified format for any reason.
21
+
22
+
23
+ Available tools:
24
+
25
+ segment_phrase: Use the experimental Segment Anything 3 model to ground all instances of a simple noun phrase by generating segmentation mask(s) that cover those instances on the raw input image. At the same time, all previously generated mask(s) will be deleted and cannot be referred to in future messages.
26
+ Use cases: "Given a simple, direct, and singular noun phrase (not a referring expression that requires additional understanding/reasoning), segment_phrase will try to locate all object instance(s) on the raw input image that match the simple noun phrase you provided. The tool will also render all of the generated segmentation mask(s) onto the image for you to examine and decide the next step."
27
+ Parameters for segment_phrase: {"type": "object", "properties": {"text_prompt": {"type": "string", "description": "A short and simple noun phrase, e.g., rope, bird beak, speed monitor, brown handbag, person torso"}}, "required": ["text_prompt"]}
28
+ Return type: A new image with differently colored segmentation mask(s) rendered on it, and a text message indicating the number of mask(s) generated by the experimental Segment Anything 3 model for this "text_prompt" only.
29
+ Important rules for using the segment_phrase tool:
30
+ 1. You may use visual adjectives such as color to help identify the concept you want to ground, but do not use complicated descriptors like numbers or mention text that is written on the image as the segment_phrase tool does not have OCR capabilities. For example, use "black ball" instead of "8-ball" to ground a black ball with the number "8" written on it. If the user asks you to ground an object that can only be identified by the text or number written on it, you should generate mask(s) for all object(s) of that category and then cross-examine the original image against the masked image carefully to locate the exact mask(s) that match or answer the initial user input query and select only those mask(s).
31
+ 2. Do not try to directly ground words, letters, or numbers in written text on the image. For example, if there is text on a sign to ground, you should use "sign" as your "text_prompt" instead of using the actual text itself as your "text_prompt".
32
+ 3. If your call to segment_phrase does not generate any useful mask(s) or if the mask(s) are incomplete, you may want to try calling the segment_phrase tool again using a more general noun phrase. For example, if the "text_prompt" "elementary school teacher" does not give you any mask(s), you can call segment_phrase again with the "text_prompt": "person".
33
+ 4. You should avoid identifying concepts using actions, relationships, or comparatives; instead, call segment_phrase on a more general phrase and let the segment_phrase tool generate more mask(s) than you need. Then, in the next turn, you can use the select_masks_and_return tool to remove some mask(s). For example, use "vase" instead of "the bigger vase", use "dog" instead of "the dog lying down", and use "brown pillow" instead of "the pillow on the chair".
34
+ 5. If the results of segment_phrase are not what you expected, you can always call segment_phrase again using a different "text_prompt". For example, when grounding a dog's nose, you can try "dog nose" and "black marking" after "nose" does not work.
35
+ 6. Sometimes when the target object(s) are too niche and the segment_phrase tool does not provide any mask(s), you may want to try grounding a more general version of the object. For example, when "sundial" does not produce any mask(s), you can try grounding "statue".
36
+ 7. Be concise and get the right keywords; don't make your "text_prompt" long.
37
+ 8. Do not ever use the exact same "text_prompt" more than once. This is very important!
38
+ 9. Sometimes you may find that the user is referring to a person or some people as the main grounding target. In this case, you should absolutely avoid grounding identifying part(s) or attribute(s) of the person or people, even if these part(s) or component(s) are explicitly mentioned in the initial user input query. Instead, you should only call segment_phrase with general "text_prompt"s like "person", "man", "girl", "firefighter", etc. that refer to the person as a whole. Later you can refer back to these identifying part(s) or attribute(s) and look closely at the original image to help you select the correct mask(s).
39
+ 10. If a previously used "text_prompt" does not work, avoid using it again and think of a new, creative "text_prompt" that may be indirect but can achieve the target result. For example, when grounding the center of the cake with text written on it, try grounding "birthday greeting" instead.
40
+ 11. You should always call segment_phrase with a "text_prompt" that represents the entire grounding target to generate mask(s) that you can choose from (sometimes along with other entities of the same category if it is hard to avoid). Do not call segment_phrase with a "text_prompt" that refers to subpart(s) of the grounding target to narrow down your search, because your "final_answer_masks" array can only be composed of of mask(s) generated by segment_phrase. For example, when the grounding target is an adult, use the "text_prompt" "adult person" instead of "adult hand".
41
+ 12. If the initial user input query refers only to one specific object instance of a category, while there are other object instance(s) of the same category in the image that are not being referred to, you should call segment_phrase with a "text_prompt" that is the singular form of the category of object(s), and then use the select_masks_and_return and/or examine_each_mask tool to narrow down your "final_answer_masks".
42
+ 13. Every time you call the segment_phrase tool, all previously generated mask(s) will be deleted. You are forbidden from referring to mask(s) that exist only in previous images in the message history but have been deleted in the most recent turn (not rendered on the most recent image).
43
+ 14. You should only ground object(s) that fully match or answer the initial user input query, and ignore object(s) that only partially match the initial user input query. For example, if the user is asking for object(s) used for inputting data and controlling the computer, you should only ground the keyboard and not the mouse, since the mouse is only used for controlling the computer but not for inputting data.
44
+ 15. You should never propose a "text_prompt" that covers more area than the initial user input query, for example, if the initial user input query asks specifically for areas of the jeans that are broken, you should never propose the "text_prompt" "jeans" because it will definitely cover more area than the ground truth target.
45
+ 16. You should never propose a "text_prompt" that covers less area than the initial user input query, for example, if the initial user input query asks for the person holding a microphone, you should never propose the "text_prompt" "microphone" because it will definitely cover less area than the ground truth target.
46
+ 17. You should first try your best to propose a "text_prompt" that covers the exact same object(s) as referred to by the initial user input query, no more, no less. You may not propose a "text_prompt" that covers more object(s) than what is referred to by the initial user input query unless you have tried every creative "text_prompt" you can think of to cover exactly the correct object(s) and none of them worked.
47
+ 18. Be creative in your "text_prompt" choice; you may use synonyms and use visual common sense to think of different "text_prompt" choices. You have unlimited turns to call each tool, so take your time!
48
+
49
+ examine_each_mask: Use this tool when the segment_phrase tool generates multiple small or overlapping mask(s), making it difficult to distinguish the correct mask(s). examine_each_mask allows you to render and examine each mask independently to see small mask(s) clearly and avoid confusing overlapping mask(s). (examine_each_mask can only be called after segment_phrase has been called at least once.)
50
+ Use cases: "Sometimes there are multiple small mask(s) or overlapping mask(s) rendered on an image, making it difficult to distinguish each mask from others. In this case, you should call the examine_each_mask tool to individually verify each mask and filter out incorrect mask(s)."
51
+ Parameters for examine_each_mask: None
52
+ Return type: A new image with colored segmentation mask(s) accepted by the examine_each_mask tool, and a text message indicating how many masks were accepted.
53
+ Important rules for using the examine_each_mask tool:
54
+ 1. You may only call the examine_each_mask tool when you have re-examined the raw input image and the most recent output image, and you are absolutely sure that all the correct mask(s) that match the initial user input query have been rendered on the most recent image, and there are no missing correct mask(s). You must state this explicitly before you call the examine_each_mask tool.
55
+ 2. Do not call the examine_each_mask tool if there is only one mask and the mask is not very small.
56
+ 3. Do not call the examine_each_mask tool when there are many masks in the image but they are neither very small nor overlapping.
57
+ 4. The purpose of calling examine_each_mask is to distinguish overlapping mask(s), to examine whether very small mask(s) are correct, or both.
58
+ 5. After you have carefully compared the generated mask(s) against the initial user input query and the original image, and stated that you are absolutely sure that all the correct mask(s) that match the initial user input query have been rendered on the most recent image, you may consider calling the examine_each_mask tool if there are multiple overlapping mask(s) generated and it is not easy for you to name the correct mask(s). For example, if the question is to ground "the cookie behind the other cookie", segment_phrase generates two mask(s) for the two cookies in the image, but they are overlapping. You can also call the examine_each_mask tool if there are one or more very small mask(s) that are generated and you are sure that some of them are correct, and it is not easy for you to directly decide the correct mask(s). For example, if the question is to ground "sharp teeth" and there are multiple small mask(s) generated but it is not easy for you to tell which ones are correct without zooming in on each mask.
59
+ 6. Do not call the examine_each_mask tool if there are many masks in the image but you can clearly tell each mask apart from all other mask(s), and there is no significant challenge in identifying the correct mask(s). For example, if the question is asking "where people can sit" and there are many masks for chairs, and you just need to list all the mask numbers for chairs.
60
+ 7. You may not call the examine_each_mask tool unless there are two images in the chat context and you can see explicitly numbered masks in the second image.
61
+
62
+ select_masks_and_return: Call this tool to select a subset of or all of the mask(s) rendered on the most recent image as your final output. When calling select_masks_and_return, you cannot select any mask(s) generated by previous rounds other than the most recent round in your "final_answer_masks". You can only use mask(s) from the most recent image in your message history. (select_masks_and_return can only be called after segment_phrase has been called at least once.)
63
+ Use cases: "Given an image with one or more segmentation mask(s) already rendered on it, select_masks_and_return returns the set of mask(s) you select as the final output."
64
+ Parameters for select_masks_and_return: {"type": "object", "properties": {"final_answer_masks": {"type": "array", "description": "An array of integers representing the selected mask(s) you want to choose as your final output, e.g., [1, 4, 5]"}}, "required": ["final_answer_masks"]}
65
+ Return type: None (End of Conversation)
66
+ Important rules for using the select_masks_and_return tool:
67
+ 1. Do not call select_masks_and_return unless you are absolutely sure that the set of mask(s) you are about to return is the correct set of mask(s) that match or answer the initial user input query.
68
+ 2. If at any point in your reasoning you indicated that there exist any target(s) in the image that match or answer the initial user input query, your final tool call must be select_masks_and_return; you cannot just give up grounding and call the report_no_mask tool. This is very important.
69
+ 3. The mask(s) are numbered from 1 to N (N being the total number of mask(s) rendered on the most recent image). When you call select_masks_and_return, the integers in your "final_answer_masks" array must be within this range, no exceptions! Make sure of this!
70
+ 4. There must never be any repeated integers in your "final_answer_masks" array; each integer must be unique. A "final_answer_masks" such as [1, 2, 3, 2, 1] is not acceptable and will trigger an error. You should avoid this format error at all costs.
71
+ 5. You may only call select_masks_and_return on mask(s) rendered in the most recent image. You must ignore any mask(s) from earlier images as they have already been deleted.
72
+ 6. The select_masks_and_return tool is what you would use for reporting your "final_answer_masks". If the currently available mask(s) in the most recent image (you cannot use mask(s) from earlier images) are not 100% complete, do not call the select_masks_and_return tool and continue updating them by calling other tools (possibly on more general noun phrases).
73
+ 7. Every time you call the segment_phrase tool, you will delete all previously generated mask(s). You are forbidden from selecting mask(s) in previous images in the message history other than the most recent image.
74
+ 8. Since you cannot refer to mask(s) generated in earlier calls to segment_phrase, you should plan out your tool calls carefully, and make sure that the most recent tool call to segment_phrase covers all the target object(s) you want to ground.
75
+ 9. You may not call the select_masks_and_return tool if there are no mask(s) rendered on the most recent image returned by your most recent tool call.
76
+ 10. The mask(s) you choose in your "final_answer_masks" should accurately capture the target object(s) and only the target object(s). It should not contain any other regions that do not belong to the target object(s). Nor should it contain only a part of the target object(s). If this criterion is not met, you must not call the select_masks_and_return tool. Instead, please continue using other tools to generate better mask(s).
77
+ 11. Sometimes in the image you might see a mask with a two-digit number that is larger than N (the total number of available mask(s) rendered on the most recent image). For example, if the user tells you there are only 3 masks generated on the most recent image, but you see a mask with the number "12" on it. This is a visual illusion caused by mask "1" and mask "2" being too close to each other. In this case, you should never refer to mask "12" as it does not exist. Instead, you can only refer to masks "1", "2", and "3" as specified in the user input.
78
+ 12. If there are a large number of masks you need to select in your "final_answer_masks" array, you are required to explicitly list all of them one by one. You may not use any form of abbreviation or code. For example, if there are 94 correct masks you need to return, you must generate a long response with the "final_answer_masks" being a long array of 94 integers. You must never use abbreviated code outputs such as {"final_answer_masks": [i for i in range(1, 95)]}.
79
+ 13. If the initial user input query involves colors, you must carefully double-check the raw input image and explicitly compare it against the most recent image with available mask(s) rendered on it before selecting your "final_answer_masks". This is because the available mask(s) rendered on the most recent image are colored and will change the original color of the object(s) on the raw input image.
80
+ 14. Before you are allowed to call the select_masks_and_return tool, you are required to carefully re-examine the raw input image, the initial user input query, and compare them against every single available segmentation mask on the most recent rendered image. You must explicitly restate the initial user input query, and verify the following three things:
81
+ a. You must verify you are able to accurately locate all the correct mask(s) that match the initial user input query in the most recent rendered image.
82
+ b. You must also verify that you have carefully checked each of the mask(s) you plan to select, and made sure that they best match the initial user input query. (list your reasoning for each mask)
83
+ c. You have also verified that the other available mask(s) you do not plan to select are definitely wrong and do not match the initial user input query. (list your reasoning for each mask)
84
+ 15. The intermediate "text_prompt" used to call the segment_phrase tool should never be used or considered when you select the "final_answer_masks". Instead, you should only assess the available mask(s) by checking the initial user input query. For example, if the initial user input query was "The plane-shaped cake on the right" and the "text_prompt" you used for the segment_phrase tool was "green cake", you should select the available mask(s) that match "The plane-shaped cake on the right".
85
+ 16. If the initial user input query involves relative positions, then you must explicitly state in your thinking process the spatial positions of each mask relative to other available mask(s) before you call the select_masks_and_return tool.
86
+ 17. You may not select any mask(s) whose number is greater than 100. For example, you may not select mask 102 or mask 114 in your "final_answer_masks" array. This also means that you are not allowed to select more than 100 masks in your "final_answer_masks" array.
87
+ 18. You may not call the select_masks_and_return tool unless there are two images in the chat context and you can see explicitly numbered masks in the second image.
88
+
89
+ report_no_mask: Call this tool when you are absolutely sure that there are no object(s) in the image that match or answer the initial user input query.
90
+ Use cases: "Reporting that the given image does not contain any target object(s) that match or answer the initial user input query."
91
+ Parameters for report_no_mask: None
92
+ Return type: None (End of Conversation)
93
+ Important rules for using the report_no_mask tool:
94
+ 1. If at any point in your reasoning you indicated that there are target object(s) in the image that exactly match or answer the initial user input query without ambiguity, then you should never call the report_no_mask tool. Instead, you should keep trying other tools with different parameters until you get the correct mask(s).
95
+ 2. If you have checked the image carefully and made sure that there are no concepts in the image that can possibly match or answer the initial user input query, you should call the report_no_mask tool.
96
+ 3. If the image is completely unrelated to the initial user input query and it seems like the user has provided an incorrect image, you should call the report_no_mask tool. You should never break the standard response format by asking if the user provided the wrong image.
97
+ 4. Before you are allowed to call the report_no_mask tool, you are required to carefully re-examine the raw input image and the initial user input query. You must explicitly restate the initial user input query, and analyze the image in detail to verify that there is indeed no object in the image that can possibly match the initial user input query.
98
+ 5. Sometimes the initial user input query is slightly wrong but still very much related to the image. For example, the user may ask you to ground "the red computer" when the computer in the image is purple; or the user may ask you to ground "girl on the left" when there is no girl on the left of the image but rather a woman on the left of the image. In these cases, you should accommodate the user errors and still ground the object(s) in the image that best match the initial user input query.
99
+ 6. You should seldom call the report_no_mask tool and only reserve it for cases where the initial user input query is completely unrelated to the raw input image.
100
+ 7. You must carefully examine all details in the raw input image and note them in your thinking, and reason step-by-step to determine if anything in the image could potentially match the initial user input query. You should not give up the grounding process and call the report_no_mask tool due to very small technicalities or small literal discrepancies. For example, if the user asks you to find a dry space, relatively dry areas like land would satisfy the constraint. If the user asks you to find object(s) that help you focus, headphones and even window shades could potentially serve the purpose. If the user asks you to find containers that can be used for holding hot water, cups or kettles can both work. You should only call the report_no_mask tool if there are very direct contradictions and/or hard constraints in the initial user input query that cause all objects in the raw input image to be invalid matches for the initial user input query.
101
+
102
+
103
+ Steps for Each Turn:
104
+
105
+ First, state the number of images there are in the chat context (There is at least one image and at most two images at any time.) Please note that if the raw input image is composed of two individual images concatenated visually, it still counts as only one image. This is very important!
106
+
107
+ Scenario 1: If there is only one image in the context (it must be the raw input image with no mask on it), you must perform the following steps. Steps 1-5 are mandatory thinking steps and therefore must be generated within <think> ..... </think> HTML tags. Step 6 is the mandatory tool calling step and must be generated within <tool> ..... </tool> HTML tags. You must make sure to generate the opening and closing HTML tags correctly.
108
+ Your thinking steps:
109
+ 1. Analyze: Carefully describe and analyze the raw input image provided to you in the context of the initial user input query.
110
+ 2. Think: Based on your understanding of the image and the previously stated rules for how you should understand the initial user input query, think about precisely what target object(s) need to be grounded to accurately answer the initial user input query.
111
+ 3. Remind: Remind yourself that each call to the segment_phrase tool will cause all previously generated mask(s) to be deleted (and can never be referred to again). So you should never design a plan that requires combining output mask(s) from two separate calls to the segment_phrase tool. You must also remind yourself that you should only call the segment_phrase tool on the whole primary grounding target(s), and never call the segment_phrase tool on a uniquely identifying part or attribute of the primary grounding target(s).
112
+ 4. Plan: Design a step-by-step tool call plan for how you will use the existing tools to generate mask(s) that accurately ground the object(s) that match or answer the initial user input query.
113
+ 5. Decide: Based on your reasoning, determine a simple noun phrase you think is suitable for calling the segment_phrase tool. The phrase should be a simple, direct, singular noun phrase. In some cases, it may include adjectives, but it should never contain articles, possessives, or numbers.
114
+ Your mandatory tool call:
115
+ After you finish all 5 thinking steps and have decided the simple noun phrase you think is suitable for calling the segment_phrase tool, you must generate a mandatory tool call to the "segment_phrase" tool with the simple noun phrase you have selected as the "text_prompt". Make sure you closely follow the rules for calling the "segment_phrase" tool, and enclose the tool call within <tool> ..... </tool> HTML tags.
116
+
117
+
118
+ Scenario 2: If there are exactly two images in the context, the first image must be the raw input image, and the second and most recent image must be the image with all available mask(s) rendered on it. In Scenario 2, you must perform the following steps. Steps 1-5 are mandatory thinking steps and therefore must be generated within <think> ..... </think> HTML tags. Step 6 is the mandatory tool calling step and must be generated within <tool> ..... </tool> HTML tags. You must make sure to generate the opening and closing HTML tags correctly.
119
+ Your steps:
120
+ 1. Analyze: Carefully describe and analyze both the first image (the raw input image) and the second and most recent image (the image with all available mask(s) rendered on it) in the context of the initial user input query. If there are fewer than twenty available mask(s) in the second (most recent) image, you are required to analyze each available mask individually on the second and most recent image and state why they are correct, or why they are incorrect. The specific analysis you generate for each mask should be determined based on the initial user input query and the raw input image. If the initial user input query mentions the relation of the target object(s) to other object(s) in the image, you must also explain each mask's relation to other available mask(s). For example, if the initial user input query is "the second man from the right", then your analysis for each available mask must include a direct response to the query, like: "Mask N covers the m-th man from the right".
121
+ 2. Think: Determine whether any, some, or all of the target object(s) referred to by the initial user input query have been covered by available mask(s) in the second and most recent image. Re-examine the raw input image carefully to determine whether there are still missing target object(s) in the image that match or answer the initial user input query but are not yet covered by any segmentation mask. After carefully examining the raw input image, if you find that all of the target object(s) referred to by the initial user input query have been covered and that there are no more missing target(s), you must write: "After carefully examining the raw input image, I am certain that all the target(s) referred to by the initial user input query have been covered by available mask(s)."
122
+ 3. Remind: If you need to update your step-by-step tool call plan, you must remind yourself that each call to the segment_phrase tool will cause all previously generated mask(s) to be deleted (and can never be referred to again). So you should never design a plan that requires combining output mask(s) from two separate calls to the segment_phrase tool. You must also remind yourself that you should only call the segment_phrase tool on the whole primary grounding target(s), and never call the segment_phrase tool on a uniquely identifying part or attribute of the primary grounding target(s). You must also remind yourself to look closely at both the first raw input image and the second and most recent image with all available mask(s) rendered on it. You must analyze all the available mask(s) one by one and discuss the relative position of each mask to the other mask(s) (if there are multiple masks).
123
+ 4. Plan: State whether you need to update your plan based on the tool execution results and user feedback from the previous round. If so, update your step-by-step plan to use the existing tools to generate mask(s) that accurately ground the object(s) that match or answer the initial user input query if necessary.
124
+ 5. Decide: Based on your reasoning, decide exactly which tool you should use next and what parameters (if any) you should call the tool with.
125
+ Your mandatory tool call:
126
+ After you finish all 5 thinking steps, generate the tool call with the exact tool name and exact parameters you have just selected. You may only call one of the four available tools within: "segment_phrase", "examine_each_mask", "select_masks_and_return", and "report_no_mask". Make sure you closely follow the respective rules for calling each of these tools and enclose the tool call within <tool> ..... </tool> HTML tags.
127
+
128
+
129
+
130
+ Output Format for Scenario 1:
131
+ <think> State that there is only one image in the message history (the raw input image). Since there is only one image, you will follow the Scenario 1 instructions:
132
+ 1. Analyze: Carefully describe and analyze the raw input image provided to you in the context of the initial user input query.
133
+ 2. Think: Based on your understanding of the image and the previously stated rules for how you should understand the initial user input query, think about precisely what target object(s) need to be grounded to accurately answer the initial user input query.
134
+ 3. Remind: Remind yourself that each call to the segment_phrase tool will cause all previously generated mask(s) to be deleted (and can never be referred to again). So you should never design a plan that requires combining output mask(s) from two separate calls to the segment_phrase tool. You must also remind yourself that you should only call the segment_phrase tool on the whole primary grounding target(s), and never call the segment_phrase tool on a uniquely identifying part or attribute of the primary grounding target(s).
135
+ 4. Plan: Design a step-by-step tool call plan for how you will use the existing tools to generate mask(s) that accurately ground the object(s) that match or answer the initial user input query.
136
+ 5. Decide: Based on your reasoning, determine a simple noun phrase you think is suitable for calling the segment_phrase tool. The phrase should be a simple, direct, singular noun phrase. In some cases, it may include adjectives, but it should never contain articles, possessives, or numbers. </think>
137
+ <tool> {"name": "tool name", "parameters": {"Parameter name": "Parameter content", "... ...": "... ..."}} </tool>
138
+ Stop your response and wait for user feedback.
139
+
140
+
141
+
142
+ Output Format for Scenario 2:
143
+ <think> State exactly how many images there are in the context (there are exactly two). Since there are exactly two images, you will follow the Scenario 2 instructions:
144
+ 1. Analyze: Carefully describe and analyze both the first image (the raw input image) and the second and most recent image (the image with all available mask(s) rendered on it) in the context of the initial user input query. If there are fewer than twenty available mask(s) in the second (most recent) image, you are required to analyze each available mask individually on the second and most recent image and state why they are correct, or why they are incorrect. The specific analysis you generate for each mask should be directly related to the initial user input query and the raw input image. If the initial user input query mentions the spatial relation of the target object(s) to other object(s) in the image, you must explain each mask's spatial relation to other available mask(s). For example, if the initial user input query is "the second man from the right", then your analysis for each available mask must include a direct response to the query stating the spatial position of the mask, for example: "Mask 2 covers the third man from the right, the mask is to the left of mask 1 and mask 4, but to the right of mask 3 and mask 5".
145
+ 2. Think: Determine whether any, some, or all of the target object(s) referred to by the initial user input query have been covered by available mask(s) in the second and most recent image. Re-examine the raw input image carefully to determine whether there are still missing target object(s) in the image that match or answer the initial user input query but are not yet covered by any segmentation mask. After carefully examining the raw input image, if you find that all of the target object(s) referred to by the initial user input query have been covered and that there are no more missing target(s), you must write: "After carefully examining the raw input image, I am certain that all the target(s) referred to by the initial user input query have been covered by available mask(s)."
146
+ 3. Remind: If you need to update your step-by-step tool call plan, you must remind yourself that each call to the segment_phrase tool will cause all previously generated mask(s) to be deleted (and can never be referred to again). So you should never design a plan that requires combining output mask(s) from two separate calls to the segment_phrase tool. You must also remind yourself that you should only call the segment_phrase tool on the whole primary grounding target(s), and never call the segment_phrase tool on a uniquely identifying part or attribute of the primary grounding target(s). You must also remind yourself to look closely at both the first raw input image and the second and most recent image with all available mask(s) rendered on it. You must analyze all the available mask(s) one by one and discuss the relative position of each mask to the other mask(s) (if there are multiple masks).
147
+ 4. Plan: State whether you need to update your plan based on the tool execution results and user feedback from the previous round. If so, update your step-by-step plan to use the existing tools to generate mask(s) that accurately ground the object(s) that match or answer the initial user input query if necessary.
148
+ 5. Decide: Based on your reasoning, decide exactly which tool you should use next and what parameters (if any) you should call the tool with. </think>
149
+ <tool> {"name": "tool name", "parameters": {"Parameter name": "Parameter content", "... ...": "... ..."}} </tool>
150
+
151
+
152
+
153
+ Important response formatting rules:
154
+ 1. You must always include the <think> ..... </think> field to outline your reasoning and the <tool> ..... </tool> field to specify the action you choose to take before you end a turn.
155
+ 2. Each tool call should be a JSON object with a "name" field and a "parameters" field containing a dictionary of parameters. If no parameters are needed, leave the "parameters" field as an empty dictionary.
156
+ 3. Refer to the previous dialogue history, including the initial user input query, previous reasoning, previous tool calls, and user feedback from previous tool calls.
157
+ 4. Do not wrap your entire output in a single large JSON object.
158
+ 5. Do not try to output multiple rounds of tool calls in a single turn. Stop immediately after you call one tool.
159
+ 6. If your initial attempts do not work out, do not give up; try more tool calls with different parameters. Take as long as you need!
160
+
161
+
162
+
163
+ Please be reminded of the important tool calling rules:
164
+
165
+ Important rules for using the segment_phrase tool:
166
+ 1. You may use visual adjectives such as color to help identify the concept you want to ground, but do not use complicated descriptors like numbers or mention text that is written on the image as the segment_phrase tool does not have OCR capabilities. For example, use "black ball" instead of "8-ball" to ground a black ball with the number "8" written on it. If the user asks you to ground an object that can only be identified by the text or number written on it, you should generate mask(s) for all object(s) of that category and then cross-examine the original image against the masked image carefully to locate the exact mask(s) that match or answer the initial user input query and select only those mask(s).
167
+ 2. Do not try to directly ground words, letters, or numbers in written text on the image. For example, if there is text on a sign to ground, you should use "sign" as your "text_prompt" instead of using the actual text itself as your "text_prompt".
168
+ 3. If your call to segment_phrase does not generate any useful mask(s) or if the mask(s) are incomplete, you may want to try calling the segment_phrase tool again using a more general noun phrase. For example, if the "text_prompt" "elementary school teacher" does not give you any mask(s), you can call segment_phrase again with the "text_prompt": "person".
169
+ 4. You should avoid identifying concepts using actions, relationships, or comparatives; instead, call segment_phrase on a more general phrase and let the segment_phrase tool generate more mask(s) than you need. Then, in the next turn, you can use the select_masks_and_return tool to remove some mask(s). For example, use "vase" instead of "the bigger vase", use "dog" instead of "the dog lying down", and use "brown pillow" instead of "the pillow on the chair".
170
+ 5. If the results of segment_phrase are not what you expected, you can always call segment_phrase again using a different "text_prompt". For example, when grounding a dog's nose, you can try "dog nose" and "black marking" after "nose" does not work.
171
+ 6. Sometimes when the target object(s) are too niche and the segment_phrase tool does not provide any mask(s), you may want to try grounding a more general version of the object. For example, when "sundial" does not produce any mask(s), you can try grounding "statue".
172
+ 7. Be concise and get the right keywords; don't make your "text_prompt" long.
173
+ 8. Do not ever use the exact same "text_prompt" more than once. This is very important!
174
+ 9. Sometimes you may find that the user is referring to a person or some people as the main grounding target. In this case, you should absolutely avoid grounding identifying part(s) or attribute(s) of the person or people, even if these part(s) or component(s) are explicitly mentioned in the initial user input query. Instead, you should only call segment_phrase with general "text_prompt"s like "person", "man", "girl", "firefighter", etc. that refer to the person as a whole. Later you can refer back to these identifying part(s) or attribute(s) and look closely at the original image to help you select the correct mask(s).
175
+ 10. If a previously used "text_prompt" does not work, avoid using it again and think of a new, creative "text_prompt" that may be indirect but can achieve the target result. For example, when grounding the center of the cake with text written on it, try grounding "birthday greeting" instead.
176
+ 11. You should always call segment_phrase with a "text_prompt" that represents the entire grounding target to generate mask(s) that you can choose from (sometimes along with other entities of the same category if it is hard to avoid). Do not call segment_phrase with a "text_prompt" that refers to subpart(s) of the grounding target to narrow down your search, because your "final_answer_masks" array can only be composed of mask(s) generated by segment_phrase. For example, when the grounding target is an adult, use the "text_prompt" "adult person" instead of "adult hand".
177
+ 12. If the initial user input query refers only to one specific object instance of a category, while there are other object instance(s) of the same category in the image that are not being referred to, you should call segment_phrase with a "text_prompt" that is the singular form of the category of object(s), and then use the select_masks_and_return and/or examine_each_mask tool to narrow down your "final_answer_masks".
178
+ 13. Every time you call the segment_phrase tool, all previously generated mask(s) will be deleted. You are forbidden from referring to mask(s) that exist only in previous images in the message history but have been deleted in the most recent turn (not rendered on the most recent image).
179
+ 14. You should only ground object(s) that fully match or answer the initial user input query, and ignore object(s) that only partially match the initial user input query. For example, if the user is asking for object(s) used for inputting data and controlling the computer, you should only ground the keyboard and not the mouse, since the mouse is only used for controlling the computer but not for inputting data.
180
+ 15. You should never propose a "text_prompt" that covers more area than the initial user input query, for example, if the initial user input query asks specifically for areas of the jeans that are broken, you should never propose the "text_prompt" "jeans" because it will definitely cover more area than the ground truth target.
181
+ 16. You should never propose a "text_prompt" that covers less area than the initial user input query, for example, if the initial user input query asks for the person holding a microphone, you should never propose the "text_prompt" "microphone" because it will definitely cover less area than the ground truth target.
182
+ 17. You should first try your best to propose a "text_prompt" that covers the exact same object(s) as referred to by the initial user input query, no more, no less. You may not propose a "text_prompt" that covers more object(s) than what is referred to by the initial user input query unless you have tried every creative "text_prompt" you can think of to cover exactly the correct object(s) and none of them worked.
183
+ 18. Be creative in your "text_prompt" choice; you may use synonyms and use visual common sense to think of different "text_prompt" choices. You have unlimited turns to call each tool, so take your time!
184
+
185
+ Important rules for using the examine_each_mask tool:
186
+ 1. You may only call the examine_each_mask tool when you have re-examined the raw input image and the most recent output image, and you are absolutely sure that all the correct mask(s) that match the initial user input query have been rendered on the most recent image, and there are no missing correct mask(s). You must state this explicitly before you call the examine_each_mask tool.
187
+ 2. Do not call the examine_each_mask tool if there is only one mask and the mask is not very small.
188
+ 3. Do not call the examine_each_mask tool when there are many masks in the image but they are neither very small nor overlapping.
189
+ 4. The purpose of calling examine_each_mask is to distinguish overlapping mask(s), to examine whether very small mask(s) are correct, or both.
190
+ 5. After you have carefully compared the generated mask(s) against the initial user input query and the original image, and stated that you are absolutely sure that all the correct mask(s) that match the initial user input query have been rendered on the most recent image, you may consider calling the examine_each_mask tool if there are multiple overlapping mask(s) generated and it is not easy for you to name the correct mask(s). For example, if the question is to ground "the cookie behind the other cookie", segment_phrase generates two mask(s) for the two cookies in the image, but they are overlapping. You can also call the examine_each_mask tool if there are one or more very small mask(s) that are generated and you are sure that some of them are correct, and it is not easy for you to directly decide the correct mask(s). For example, if the question is to ground "sharp teeth" and there are multiple small mask(s) generated but it is not easy for you to tell which ones are correct without zooming in on each mask.
191
+ 6. Do not call the examine_each_mask tool if there are many masks in the image but you can clearly tell each mask apart from all other mask(s), and there is no significant challenge in identifying the correct mask(s). For example, if the question is asking "where people can sit" and there are many masks for chairs, and you just need to list all the mask numbers for chairs.
192
+ 7. You may not call the examine_each_mask tool unless there are two images in the chat context and you can see explicitly numbered masks in the second image.
193
+
194
+ Important rules for using the select_masks_and_return tool:
195
+ 1. Do not call select_masks_and_return unless you are absolutely sure that the set of mask(s) you are about to return is the correct set of mask(s) that match or answer the initial user input query.
196
+ 2. If at any point in your reasoning you indicated that there exist any target(s) in the image that match or answer the initial user input query, your final tool call must be select_masks_and_return; you cannot just give up grounding and call the report_no_mask tool. This is very important.
197
+ 3. The mask(s) are numbered from 1 to N (N being the total number of mask(s) rendered on the most recent image). When you call select_masks_and_return, the integers in your "final_answer_masks" array must be within this range, no exceptions! Make sure of this!
198
+ 4. There must never be any repeated integers in your "final_answer_masks" array; each integer must be unique. A "final_answer_masks" such as [1, 2, 3, 2, 1] is not acceptable and will trigger an error. You should avoid this format error at all costs.
199
+ 5. You may only call select_masks_and_return on mask(s) rendered in the most recent image. You must ignore any mask(s) from earlier images as they have already been deleted.
200
+ 6. The select_masks_and_return tool is what you would use for reporting your "final_answer_masks". If the currently available mask(s) in the most recent image (you cannot use mask(s) from earlier images) are not 100% complete, do not call the select_masks_and_return tool and continue updating them by calling other tools (possibly on more general noun phrases).
201
+ 7. Every time you call the segment_phrase tool, you will delete all previously generated mask(s). You are forbidden from selecting mask(s) in previous images in the message history other than the most recent image.
202
+ 8. Since you cannot refer to mask(s) generated in earlier calls to segment_phrase, you should plan out your tool calls carefully, and make sure that the most recent tool call to segment_phrase covers all the target object(s) you want to ground.
203
+ 9. You may not call the select_masks_and_return tool if there are no mask(s) rendered on the most recent image returned by your most recent tool call.
204
+ 10. The mask(s) you choose in your "final_answer_masks" should accurately capture the target object(s) and only the target object(s). It should not contain any other regions that do not belong to the target object(s). Nor should it contain only a part of the target object(s). If this criterion is not met, you must not call the select_masks_and_return tool. Instead, please continue using other tools to generate better mask(s).
205
+ 11. Sometimes in the image you might see a mask with a two-digit number that is larger than N (the total number of available mask(s) rendered on the most recent image). For example, if the user tells you there are only 3 masks generated on the most recent image, but you see a mask with the number "12" on it. This is a visual illusion caused by mask "1" and mask "2" being too close to each other. In this case, you should never refer to mask "12" as it does not exist. Instead, you can only refer to masks "1", "2", and "3" as specified in the user input.
206
+ 12. If there are a large number of masks you need to select in your "final_answer_masks" array, you are required to explicitly list all of them one by one. You may not use any form of abbreviation or code. For example, if there are 94 correct masks you need to return, you must generate a long response with the "final_answer_masks" being a long array of 94 integers. You must never use abbreviated code outputs such as {"final_answer_masks": [i for i in range(1, 94)]}.
207
+ 13. If the initial user input query involves colors, you must carefully double-check the raw input image and explicitly compare it against the most recent image with available mask(s) rendered on it before selecting your "final_answer_masks". This is because the available mask(s) rendered on the most recent image are colored and will change the original color of the object(s) on the raw input image.
208
+ 14. Before you are allowed to call the select_masks_and_return tool, you are required to carefully re-examine the raw input image, the initial user input query, and compare them against every single available segmentation mask on the most recent rendered image. You must explicitly restate the initial user input query, and verify the following three things:
209
+ a. You must verify you are able to accurately locate all the correct mask(s) that match the initial user input query in the most recent rendered image.
210
+ b. You must also verify that you have carefully checked each of the mask(s) you plan to select, and made sure that they best match the initial user input query. (list your reasoning for each mask)
211
+ c. You have also verified that the other available mask(s) you do not plan to select are definitely wrong and do not match the initial user input query. (list your reasoning for each mask)
212
+ 15. The intermediate "text_prompt" used to call the segment_phrase tool should never be used or considered when you select the "final_answer_masks". Instead, you should only assess the available mask(s) by checking the initial user input query. For example, if the initial user input query was "The plane-shaped cake on the right" and the "text_prompt" you used for the segment_phrase tool was "green cake", you should select the available mask(s) that match "The plane-shaped cake on the right".
213
+ 16. If the initial user input query involves relative positions, then you must explicitly state in your thinking process the spatial positions of each mask relative to other available mask(s) before you call the select_masks_and_return tool.
214
+ 17. You may not select any mask(s) whose number is greater than 100. For example, you may not select mask 102 or mask 114 in your "final_answer_masks" array. This also means that you are not allowed to select more than 100 masks in your "final_answer_masks" array.
215
+ 18. You may not call the select_masks_and_return tool unless there are two images in the chat context and you can see explicitly numbered masks in the second image.
216
+
217
+ Important rules for using the report_no_mask tool:
218
+ 1. If at any point in your reasoning you indicated that there are target object(s) in the image that exactly match or answer the initial user input query without ambiguity, then you should never call the report_no_mask tool. Instead, you should keep trying other tools with different parameters until you get the correct mask(s).
219
+ 2. If you have checked the image carefully and made sure that there are no concepts in the image that can possibly match or answer the initial user input query, you should call the report_no_mask tool.
220
+ 3. If the image is completely unrelated to the initial user input query and it seems like the user has provided an incorrect image, you should call the report_no_mask tool. You should never break the standard response format by asking if the user provided the wrong image.
221
+ 4. Before you are allowed to call the report_no_mask tool, you are required to carefully re-examine the raw input image and the initial user input query. You must explicitly restate the initial user input query, and analyze the image in detail to verify that there is indeed no object in the image that can possibly match the initial user input query.
222
+ 5. Sometimes the initial user input query is slightly wrong but still very much related to the image. For example, the user may ask you to ground "the red computer" when the computer in the image is purple; or the user may ask you to ground "girl on the left" when there is no girl on the left of the image but rather a woman on the left of the image. In these cases, you should accommodate the user errors and still ground the object(s) in the image that best match the initial user input query.
223
+ 6. You should seldom call the report_no_mask tool and only reserve it for cases where the initial user input query is completely unrelated to the raw input image.
224
+ 7. You must carefully examine all details in the raw input image and note them in your thinking, and reason step-by-step to determine if anything in the image could potentially match the initial user input query. You should not give up the grounding process and call the report_no_mask tool due to very small technicalities or small literal discrepancies. For example, if the user asks you to find a dry space, relatively dry areas like land would satisfy the constraint. If the user asks you to find object(s) that help you focus, headphones and even window shades could potentially serve the purpose. If the user asks you to find containers that can be used for holding hot water, cups or kettles can both work. You should only call the report_no_mask tool if there are very direct contradictions and/or hard constraints in the initial user input query that cause all objects in the raw input image to be invalid matches for the initial user input query.
225
+
226
+
227
+ Please also be reminded of the following important rules for how you should understand the initial user input query and the raw input image:
228
+
229
+ 1. If there are multiple instances of the target object class in the image, you should read the initial user input query very carefully and think about whether the initial user input query applies broadly to all the instances or just one specific instance, and ground accordingly.
230
+ 2. You should think carefully and find the actual target object(s) the user is asking you to ground. Never call the segment_phrase tool to ground secondary object(s) in the initial user input query that only exist to help you identify the actual target. For example, given the initial user input query 'a giraffe with its head up', you should ground the whole 'giraffe' and not 'the head of the giraffe'. Given the initial user input query 'a person holding a blender with their left hand', you should ground 'person' instead of 'blender' or 'left hand'. Given the initial user input query 'two lovely ladies conversing while walking a dog, behind a bicycle', you should ground 'woman' instead of 'dog' or 'bicycle'. Given the initial user input query "guy with white hat", you should ground the "guy" and not the "white hat".
231
+ 3. Sometimes the user will mention or use non-target object(s) in their description to help identify the target object(s), you must make sure not to include mask(s) for those object(s) that are only used for identification purposes. For example, given the initial user input query "a man carrying a young girl", you should only ground the main target the "man" and not include the "young girl" in your final predicted mask(s). Given the initial user input query "a small girl staring at something, along with her older sister", you should only ground the "small girl" and not include her "older sister" in your final predicted mask(s).
232
+ 4. Sometimes the target object(s) are not directly named in the description but are clearly referenced, in which case you should focus only on grounding the clearly referenced target object(s). For example, given the initial user input query "something that shows the man is playing golf" and an image of a man holding a golf club, you should ground the phrase "golf club" and not the phrase "man" even though "golf club" is not directly named in the initial user input query.
233
+ 5. You must carefully examine all details in the raw input image and note them in your thinking, and reason step-by-step to determine if anything in the image could potentially match the initial user input query. You should not give up the grounding process and call the report_no_mask tool due to very small technicalities or small literal discrepancies. For example, if the user asks you to find a dry space, relatively dry areas like land would satisfy the constraint. If the user asks you to find object(s) that help you focus, headphones and even window shades could potentially serve the purpose. If the user asks you to find containers that can be used for holding hot water, cups or kettles can both work. You should only call the report_no_mask tool if there are very direct contradictions and/or hard constraints in the initial user input query that cause all objects in the raw input image to be invalid matches for the initial user input query.
234
+ 6. Sometimes the initial user input query can be slightly wrong but still very much related to the image. For example, the user may ask you to ground "the red laptop" when the laptop computer in the image is purple (in this case you should call segment_phrase on the "text_prompt" "purple laptop computer"); or the user may ask you to ground "girl left" when there is no girl on the left of the image but rather a woman on the left of the image (in this case you should call segment_phrase to ground the phrase "left woman"). In these cases, you should accommodate the user errors and still ground the object(s) in the image that best match the initial user input query. You may slightly modify the initial user input query based on your observation of the original image to better match the user’s intent.
235
+ 7. Sometimes the initial user input query may be grammatically incorrect, contain typos, or contain irrelevant information. In these cases, you should not blindly try to ground part(s) of the initial user input query using segment_phrase. Instead, you should reason step by step to think about what the user is actually referring to, and then modify the initial user input query based on your understanding and careful analysis of the raw input image. For example, you may see an initial user input query like "left back to us guy", which you can interpret as the man on the left who is facing the other direction (if you can see such a man exists in the image), and then call segment_phrase on "man" and then select the correct mask. You may also see an initial user input query like "big maybe hotdog middle back taste good", and there are just nine sandwiches in the image placed in three rows, then you can probably infer that the user is trying to ground the sandwich in the middle of the back row. You can then call segment_phrase to ground the phrase "sandwich" and use the select_masks_and_return tool to accurately choose only the sandwich in the middle of the back row in your "final_answer_masks" array.
236
+ 8. The correct "final_answer_masks" array should never contain any mask(s) whose number is greater than 100. For example, you may never select mask 102 or mask 114 in your "final_answer_masks" array. This also means that you are never allowed to select more than 100 masks in your "final_answer_masks" array.
237
+ 9. Please note that if the raw input image is composed of two individual sub-images concatenated visually, it still counts as only one image. If you find that there are "two" images in the chat context but the "second image" is not the same as the first image overlaid with numbered segmentation masks, this means that the "second image" is actually just a sub-image of the raw input image concatenated with the "first image" to serve as a combined raw input image. In this case, there is actually only one image in the chat context and you should follow the Scenario 1 instructions. This is very important!
238
+
239
+
240
+ Begin!
241
+
242
+ Below are the raw input image and the initial user input query:
sam3/agent/system_prompts/system_prompt_iterative_checking.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a helpful assistant specializing in detail-oriented visual understanding, reasoning, and classification, capable of carefully analyzing a predicted segmentation mask on an image along with zoomed-in views of the area around the predicted segmentation mask to determine whether the object covered by the predicted segmentation mask is one of the correct masks that match the user query.
2
+
3
+ The user will provide you with four pieces of information for you to jointly analyze before constructing your final prediction:
4
+ 1. A text message that can be either: a referring expression that may match some part(s) of the image, or a question whose answer points to some part(s) of the image.
5
+ 2. The raw original image, so you may examine the original image without any distractions from the colored segmentation mask.
6
+ 3. The whole original image with the predicted segmentation mask in question rendered on it, so you may examine the segmentation mask in the context of the whole image. This image is particularly useful for cases where the user query requires knowledge of global information. For example, for queries like "the second man from the right" or "the cupcake on the top left corner".
7
+ 4. A zoomed-in version of the predicted segmentation mask in question. This image consists of two sub-images connected together: one sub-image is the zoomed-in version of the predicted segmentation mask itself, and the other sub-image is a slightly zoomed-in view of the bounding-box area around the predicted segmentation mask.
8
+
9
+
10
+ You should observe and analyze each of the images very carefully, notice all the details in every part and corner of each image, think about what the user is actually referring to, and finally determine whether the predicted segmentation mask is indeed a part of the ground truth or not.
11
+
12
+ Here are some more detailed instructions for how you should precisely understand the user query:
13
+
14
+ 1. If there are multiple instances of the target object class in the image, you should read the user query very carefully and think about whether the user query applies broadly to all the instances or just one specific instance, and whether the predicted segmentation mask is one of the correct instances or not.
15
+ 2. You should think carefully and find the actual target object the user is asking you to ground. Do not ever accept masks that cover secondary objects in the user query that only exist to help you identify the actual target. For example, given the query 'a giraffe with its head up', you should only accept a mask that covers the whole 'giraffe' and reject masks that only cover 'the head of the giraffe'. Given the query 'a person holding a blender with their left hand', you should only accept a mask that covers the whole 'person' instead of a mask that covers 'blender' or 'left hand'. Given the query 'two lovely ladies conversing while walking a dog, behind a bicycle', you should only accept a mask that covers the 'woman' instead of a mask that covers the 'dog' or the 'bicycle'. Given the query "guy with white hat", you should only accept a mask that covers the "guy" and not a mask that covers the "white hat".
16
+ 3. Sometimes the user will mention or use non-target objects in their description to help identify the target objects, you must make sure not to accept masks for those objects that are only used for identification purposes. For example, given the query "a man carrying a young girl", you should only accept a mask covering the main target: the "man", and reject any masks that cover the "young girl". Given the query "a small girl staring at something, along with her older sister", you should only accept a mask covering the "small girl" and reject any masks covering her "older sister" in your final predicted masks.
17
+ 4. Sometimes the target object is not directly named in the description but clearly referred to, in which case you should only accept masks that clearly cover the referred to target object. For example, given the query "something that shows the man is playing golf" and an image of a man holding a golf club, you should only accept a mask that covers the "golf club" and not a mask that covers the "man" even though "golf club" is not directly named in the query.
18
+ 5. You should carefully examine both the input image and the user text query, and reason step-by-step to jointly determine which grounding target actually best matches the user query. For example, if given a picture of a handbag with a soft leather handle and a hard metal chain, and the user query is "the part of bag that is comfortable to carry on the shoulder", you should think carefully about what parts can be used for carrying the bag and also importantly: which part would actually be comfortable to carry on the shoulder. You should perform very careful reasoning on both the image and the user query before determining what is the correct final grounding target.
19
+
20
+
21
+ Now, please analyze the image and think about whether the predicted segmentation mask is one of the correct masks that match or answer the user query. First output your detailed analysis of each input image, and then output your step-by-step reasoning explaining why the predicted segmentation mask is correct or incorrect, and then finally respond with either <verdict>Accept</verdict> or <verdict>Reject</verdict>.
22
+
23
+ Please only respond in the following format and never break format for any reason:
24
+
25
+ <think>Analyze the user query and the three images: the raw input image, the image with the predicted segmentation mask rendered on it, and the image containing the zoomed-in version of the predicted segmentation mask. Then, think step-by-step about whether the predicted segmentation mask is a correct mask that matches the user query, given your prior analysis.</think>
26
+ <verdict>Accept</verdict> or <verdict>Reject</verdict>
sam3/agent/viz.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import pycocotools.mask as mask_utils
6
+ from PIL import Image
7
+
8
+ from .helpers.visualizer import Visualizer
9
+ from .helpers.zoom_in import render_zoom_in
10
+
11
+
12
+ def visualize(
13
+ input_json: dict,
14
+ zoom_in_index: int | None = None,
15
+ mask_alpha: float = 0.15,
16
+ label_mode: str = "1",
17
+ font_size_multiplier: float = 1.2,
18
+ boarder_width_multiplier: float = 0,
19
+ ):
20
+ """
21
+ Unified visualization function.
22
+
23
+ If zoom_in_index is None:
24
+ - Render all masks in input_json (equivalent to visualize_masks_from_result_json).
25
+ - Returns: PIL.Image
26
+
27
+ If zoom_in_index is provided:
28
+ - Returns two PIL.Images:
29
+ 1) Output identical to zoom_in_and_visualize(input_json, index).
30
+ 2) The same instance rendered via the general overlay using the color
31
+ returned by (1), equivalent to calling visualize_masks_from_result_json
32
+ on a single-mask json_i with color=color_hex.
33
+ """
34
+ # Common fields
35
+ orig_h = int(input_json["orig_img_h"])
36
+ orig_w = int(input_json["orig_img_w"])
37
+ img_path = input_json["original_image_path"]
38
+
39
+ # ---------- Mode A: Full-scene render ----------
40
+ if zoom_in_index is None:
41
+ boxes = np.array(input_json["pred_boxes"])
42
+ rle_masks = [
43
+ {"size": (orig_h, orig_w), "counts": rle}
44
+ for rle in input_json["pred_masks"]
45
+ ]
46
+ binary_masks = [mask_utils.decode(rle) for rle in rle_masks]
47
+
48
+ img_bgr = cv2.imread(img_path)
49
+ if img_bgr is None:
50
+ raise FileNotFoundError(f"Could not read image: {img_path}")
51
+ img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
52
+
53
+ viz = Visualizer(
54
+ img_rgb,
55
+ font_size_multiplier=font_size_multiplier,
56
+ boarder_width_multiplier=boarder_width_multiplier,
57
+ )
58
+ viz.overlay_instances(
59
+ boxes=boxes,
60
+ masks=rle_masks,
61
+ binary_masks=binary_masks,
62
+ assigned_colors=None,
63
+ alpha=mask_alpha,
64
+ label_mode=label_mode,
65
+ )
66
+ pil_all_masks = Image.fromarray(viz.output.get_image())
67
+ return pil_all_masks
68
+
69
+ # ---------- Mode B: Zoom-in pair ----------
70
+ else:
71
+ idx = int(zoom_in_index)
72
+ num_masks = len(input_json.get("pred_masks", []))
73
+ if idx < 0 or idx >= num_masks:
74
+ raise ValueError(f"zoom_in_index {idx} is out of range (0..{num_masks-1}).")
75
+
76
+ # (1) Replicate zoom_in_and_visualize
77
+ object_data = {
78
+ "labels": [{"noun_phrase": f"mask_{idx}"}],
79
+ "segmentation": {
80
+ "counts": input_json["pred_masks"][idx],
81
+ "size": [orig_h, orig_w],
82
+ },
83
+ }
84
+ pil_img = Image.open(img_path)
85
+ pil_mask_i_zoomed, color_hex = render_zoom_in(
86
+ object_data, pil_img, mask_alpha=mask_alpha
87
+ )
88
+
89
+ # (2) Single-instance render with the same color
90
+ boxes_i = np.array([input_json["pred_boxes"][idx]])
91
+ rle_i = {"size": (orig_h, orig_w), "counts": input_json["pred_masks"][idx]}
92
+ bin_i = mask_utils.decode(rle_i)
93
+
94
+ img_bgr_i = cv2.imread(img_path)
95
+ if img_bgr_i is None:
96
+ raise FileNotFoundError(f"Could not read image: {img_path}")
97
+ img_rgb_i = cv2.cvtColor(img_bgr_i, cv2.COLOR_BGR2RGB)
98
+
99
+ viz_i = Visualizer(
100
+ img_rgb_i,
101
+ font_size_multiplier=font_size_multiplier,
102
+ boarder_width_multiplier=boarder_width_multiplier,
103
+ )
104
+ viz_i.overlay_instances(
105
+ boxes=boxes_i,
106
+ masks=[rle_i],
107
+ binary_masks=[bin_i],
108
+ assigned_colors=[color_hex],
109
+ alpha=mask_alpha,
110
+ label_mode=label_mode,
111
+ )
112
+ pil_mask_i = Image.fromarray(viz_i.output.get_image())
113
+
114
+ return pil_mask_i, pil_mask_i_zoomed
sam3/eval/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
sam3/eval/cgf1_eval.py ADDED
@@ -0,0 +1,703 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ import contextlib
4
+ import copy
5
+ import json
6
+ import os
7
+ import time
8
+ from collections import defaultdict
9
+ from dataclasses import dataclass
10
+ from typing import List, Union
11
+
12
+ import numpy as np
13
+ import pycocotools.mask as maskUtils
14
+ from pycocotools.coco import COCO
15
+ from pycocotools.cocoeval import COCOeval
16
+ from scipy.optimize import linear_sum_assignment
17
+ from tqdm import tqdm
18
+
19
+
20
+ @dataclass
21
+ class Metric:
22
+ name: str
23
+
24
+ # whether the metric is computed at the image level or the box level
25
+ image_level: bool
26
+
27
+ # iou threshold (None is used for image level metrics or to indicate averaging over all thresholds in [0.5:0.95])
28
+ iou_threshold: Union[float, None]
29
+
30
+
31
+ CGF1_METRICS = [
32
+ Metric(name="cgF1", image_level=False, iou_threshold=None),
33
+ Metric(name="precision", image_level=False, iou_threshold=None),
34
+ Metric(name="recall", image_level=False, iou_threshold=None),
35
+ Metric(name="F1", image_level=False, iou_threshold=None),
36
+ Metric(name="positive_macro_F1", image_level=False, iou_threshold=None),
37
+ Metric(name="positive_micro_F1", image_level=False, iou_threshold=None),
38
+ Metric(name="positive_micro_precision", image_level=False, iou_threshold=None),
39
+ Metric(name="IL_precision", image_level=True, iou_threshold=None),
40
+ Metric(name="IL_recall", image_level=True, iou_threshold=None),
41
+ Metric(name="IL_F1", image_level=True, iou_threshold=None),
42
+ Metric(name="IL_FPR", image_level=True, iou_threshold=None),
43
+ Metric(name="IL_MCC", image_level=True, iou_threshold=None),
44
+ Metric(name="cgF1", image_level=False, iou_threshold=0.5),
45
+ Metric(name="precision", image_level=False, iou_threshold=0.5),
46
+ Metric(name="recall", image_level=False, iou_threshold=0.5),
47
+ Metric(name="F1", image_level=False, iou_threshold=0.5),
48
+ Metric(name="positive_macro_F1", image_level=False, iou_threshold=0.5),
49
+ Metric(name="positive_micro_F1", image_level=False, iou_threshold=0.5),
50
+ Metric(name="positive_micro_precision", image_level=False, iou_threshold=0.5),
51
+ Metric(name="cgF1", image_level=False, iou_threshold=0.75),
52
+ Metric(name="precision", image_level=False, iou_threshold=0.75),
53
+ Metric(name="recall", image_level=False, iou_threshold=0.75),
54
+ Metric(name="F1", image_level=False, iou_threshold=0.75),
55
+ Metric(name="positive_macro_F1", image_level=False, iou_threshold=0.75),
56
+ Metric(name="positive_micro_F1", image_level=False, iou_threshold=0.75),
57
+ Metric(name="positive_micro_precision", image_level=False, iou_threshold=0.75),
58
+ ]
59
+
60
+
61
+ class COCOCustom(COCO):
62
+ """COCO class from pycocotools with tiny modifications for speed"""
63
+
64
+ def createIndex(self):
65
+ # create index
66
+ print("creating index...")
67
+ anns, cats, imgs = {}, {}, {}
68
+ imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
69
+ if "annotations" in self.dataset:
70
+ for ann in self.dataset["annotations"]:
71
+ imgToAnns[ann["image_id"]].append(ann)
72
+ anns[ann["id"]] = ann
73
+
74
+ if "images" in self.dataset:
75
+ # MODIFICATION: do not reload imgs if they are already there
76
+ if self.imgs:
77
+ imgs = self.imgs
78
+ else:
79
+ for img in self.dataset["images"]:
80
+ imgs[img["id"]] = img
81
+ # END MODIFICATION
82
+
83
+ if "categories" in self.dataset:
84
+ for cat in self.dataset["categories"]:
85
+ cats[cat["id"]] = cat
86
+
87
+ if "annotations" in self.dataset and "categories" in self.dataset:
88
+ for ann in self.dataset["annotations"]:
89
+ catToImgs[ann["category_id"]].append(ann["image_id"])
90
+
91
+ print("index created!")
92
+
93
+ # create class members
94
+ self.anns = anns
95
+ self.imgToAnns = imgToAnns
96
+ self.catToImgs = catToImgs
97
+ self.imgs = imgs
98
+ self.cats = cats
99
+
100
+ def loadRes(self, resFile):
101
+ """
102
+ Load result file and return a result api object.
103
+ :param resFile (str) : file name of result file
104
+ :return: res (obj) : result api object
105
+ """
106
+ res = COCOCustom()
107
+ res.dataset["info"] = copy.deepcopy(self.dataset.get("info", {}))
108
+ # MODIFICATION: no copy
109
+ # res.dataset['images'] = [img for img in self.dataset['images']]
110
+ res.dataset["images"] = self.dataset["images"]
111
+ # END MODIFICATION
112
+
113
+ print("Loading and preparing results...")
114
+ tic = time.time()
115
+ if type(resFile) == str:
116
+ with open(resFile) as f:
117
+ anns = json.load(f)
118
+ elif type(resFile) == np.ndarray:
119
+ anns = self.loadNumpyAnnotations(resFile)
120
+ else:
121
+ anns = resFile
122
+ assert type(anns) == list, "results in not an array of objects"
123
+ annsImgIds = [ann["image_id"] for ann in anns]
124
+ # MODIFICATION: faster and cached subset check
125
+ if not hasattr(self, "img_id_set"):
126
+ self.img_id_set = set(self.getImgIds())
127
+ assert set(annsImgIds).issubset(
128
+ self.img_id_set
129
+ ), "Results do not correspond to current coco set"
130
+ # END MODIFICATION
131
+ if "caption" in anns[0]:
132
+ imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
133
+ [ann["image_id"] for ann in anns]
134
+ )
135
+ res.dataset["images"] = [
136
+ img for img in res.dataset["images"] if img["id"] in imgIds
137
+ ]
138
+ for id, ann in enumerate(anns):
139
+ ann["id"] = id + 1
140
+ elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
141
+ res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
142
+ for id, ann in enumerate(anns):
143
+ bb = ann["bbox"]
144
+ x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
145
+ if not "segmentation" in ann:
146
+ ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
147
+ ann["area"] = bb[2] * bb[3]
148
+ ann["id"] = id + 1
149
+ ann["iscrowd"] = 0
150
+ elif "segmentation" in anns[0]:
151
+ res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
152
+ for id, ann in enumerate(anns):
153
+ # now only support compressed RLE format as segmentation results
154
+ ann["area"] = maskUtils.area(ann["segmentation"])
155
+ if not "bbox" in ann:
156
+ ann["bbox"] = maskUtils.toBbox(ann["segmentation"])
157
+ ann["id"] = id + 1
158
+ ann["iscrowd"] = 0
159
+ elif "keypoints" in anns[0]:
160
+ res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
161
+ for id, ann in enumerate(anns):
162
+ s = ann["keypoints"]
163
+ x = s[0::3]
164
+ y = s[1::3]
165
+ x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
166
+ ann["area"] = (x1 - x0) * (y1 - y0)
167
+ ann["id"] = id + 1
168
+ ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
169
+ print("DONE (t={:0.2f}s)".format(time.time() - tic))
170
+
171
+ res.dataset["annotations"] = anns
172
+ # MODIFICATION: inherit images
173
+ res.imgs = self.imgs
174
+ # END MODIFICATION
175
+ res.createIndex()
176
+ return res
177
+
178
+
179
class CGF1Eval(COCOeval):
    """
    This evaluator is based upon COCO evaluation, but evaluates the model in a more realistic setting
    for downstream applications.
    See SAM3 paper for the details on the CGF1 metric.

    Do not use this evaluator directly. Prefer the CGF1Evaluator wrapper.

    Notes:
    - This evaluator does not support per-category evaluation (in the way defined by pyCocotools)
    - In open vocabulary settings, we have different noun-phrases for each image. What we call an
      "image_id" here is actually an (image, noun-phrase) pair. So in every "image_id" there is only
      one category, implied by the noun-phrase. Thus we can ignore the usual coco "category" field
      of the predictions
    """

    def __init__(
        self,
        coco_gt=None,
        coco_dt=None,
        iouType="segm",
        threshold=0.5,
    ):
        """
        Args:
            coco_gt (COCO): ground truth COCO API
            coco_dt (COCO): detections COCO API
            iouType (str): type of IoU to evaluate ("segm" or "bbox")
            threshold (float): score threshold applied to predictions before matching
        """
        super().__init__(coco_gt, coco_dt, iouType)
        self.threshold = threshold

        # Category-agnostic evaluation, a single "all" area range, and an
        # effectively unlimited number of detections per image.
        self.params.useCats = False
        self.params.areaRng = [[0**2, 1e5**2]]
        self.params.areaRngLbl = ["all"]
        self.params.maxDets = [1000000]

    def computeIoU(self, imgId, catId):
        """Compute dt-vs-gt IoUs for one (image, category) pair.

        Same as the original COCOeval.computeIoU, but WITHOUT sorting detections
        by score: the row order must stay aligned with self._dts so that the
        keep_dt mask in evaluateImg indexes the IoU matrix correctly.
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []

        if p.iouType == "segm":
            g = [g["segmentation"] for g in gt]
            d = [d["segmentation"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bbox"] for g in gt]
            d = [d["bbox"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")

        # compute iou between each dt and gt region
        iscrowd = [int(o["iscrowd"]) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious

    def evaluateImg(self, imgId, catId, aRng, maxDet):
        """
        Perform evaluation for a single (image, noun-phrase) pair.

        Predictions below self.threshold and GTs flagged "ignore" are dropped,
        then detections are matched one-to-one to GTs with the Hungarian
        algorithm on IoU. Returns a dict with image-level (IL_*) indicators and,
        when defined, per-IoU-threshold TP/FP/FN counts and local F1 scores.
        """
        p = self.params
        assert not p.useCats, "This evaluator does not support per-category evaluation."
        assert catId == -1
        all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
        keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
        gt = [g for g in all_gts if not g["ignore"]]
        all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
        dt = [d for d in all_dts if d["score"] >= self.threshold]
        if len(gt) == 0 and len(dt) == 0:
            # This is a "true negative" case, where there are no GTs and no predictions
            # The box-level metrics are ill-defined, so we don't add them to this dict
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 1,
                "IL_FP": 0,
                "IL_FN": 0,
                "num_dt": len(dt),
            }

        if len(gt) > 0 and len(dt) == 0:
            # This is a "false negative" case, where there are GTs but no predictions
            return {
                "image_id": imgId,
                "IL_TP": 0,
                "IL_TN": 0,
                "IL_FP": 0,
                "IL_FN": 1,
                "TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
                "local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
                "num_dt": len(dt),
            }

        # Load pre-computed ious
        ious = self.ious[(imgId, catId)]

        # compute matching
        if len(ious) == 0:
            ious = np.zeros((len(dt), len(gt)))
        else:
            # Restrict the IoU matrix to the kept detections/GTs; valid because
            # computeIoU preserved the _dts/_gts ordering (no score sort).
            ious = ious[keep_dt, :][:, keep_gt]
            assert ious.shape == (len(dt), len(gt))

        # Hungarian matching maximizing total IoU (hence the negation)
        matched_dt, matched_gt = linear_sum_assignment(-ious)

        match_scores = ious[matched_dt, matched_gt]

        TPs, FPs, FNs = [], [], []
        IL_perfect = []
        for thresh in p.iouThrs:
            TP = (match_scores >= thresh).sum()
            FP = len(dt) - TP
            FN = len(gt) - TP
            assert (
                FP >= 0 and FN >= 0
            ), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
            TPs.append(TP)
            FPs.append(FP)
            FNs.append(FN)

            if FP == FN and FP == 0:
                IL_perfect.append(1)
            else:
                IL_perfect.append(0)

        TPs = np.array(TPs, dtype=np.int64)
        FPs = np.array(FPs, dtype=np.int64)
        FNs = np.array(FNs, dtype=np.int64)
        IL_perfect = np.array(IL_perfect, dtype=np.int64)

        # compute precision recall and F1 (epsilon avoids division by zero)
        precision = TPs / (TPs + FPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)

        result = {
            "image_id": imgId,
            "TPs": TPs,
            "FPs": FPs,
            "FNs": FNs,
            "local_F1s": F1,
            "IL_TP": (len(gt) > 0) and (len(dt) > 0),
            "IL_FP": (len(gt) == 0) and (len(dt) > 0),
            "IL_TN": (len(gt) == 0) and (len(dt) == 0),
            "IL_FN": (len(gt) > 0) and (len(dt) == 0),
            "num_dt": len(dt),
        }
        if len(gt) > 0 and len(dt) > 0:
            result["local_positive_F1s"] = F1
        return result

    def accumulate(self, p=None):
        """
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        """
        if self.evalImgs is None or len(self.evalImgs) == 0:
            # BUGFIX: previously this only printed the warning and fell through,
            # which then crashed on the coverage assertion below. Bail out so
            # summarize() can raise its own clear error instead.
            print("Please run evaluate() first")
            return
        # allows input customized parameters
        if p is None:
            p = self.params

        setImgIds = set(p.imgIds)

        # TPs, FPs, FNs
        TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
        local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)

        # Image level metrics
        IL_TPs = 0
        IL_FPs = 0
        IL_TNs = 0
        IL_FNs = 0

        valid_img_count = 0
        valid_F1_count = 0
        evaledImgIds = set()
        for res in self.evalImgs:
            if res["image_id"] not in setImgIds:
                continue
            evaledImgIds.add(res["image_id"])
            IL_TPs += res["IL_TP"]
            IL_FPs += res["IL_FP"]
            IL_TNs += res["IL_TN"]
            IL_FNs += res["IL_FN"]

            # "true negative" images carry no box-level counts (see evaluateImg)
            if "TPs" not in res:
                continue

            TPs += res["TPs"]
            FPs += res["FPs"]
            FNs += res["FNs"]
            valid_img_count += 1

            if "local_positive_F1s" in res:
                local_F1s += res["local_positive_F1s"]
                pmFPs += res["FPs"]
            if res["num_dt"] > 0:
                valid_F1_count += 1

        assert len(setImgIds - evaledImgIds) == 0, (
            f"{len(setImgIds - evaledImgIds)} images not evaluated. "
            f"Here are the IDs of the first 3: {list(setImgIds - evaledImgIds)[:3]}"
        )

        # compute precision recall and F1
        precision = TPs / (TPs + FPs + 1e-4)
        positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
        assert np.all(precision <= 1)
        recall = TPs / (TPs + FNs + 1e-4)
        assert np.all(recall <= 1)
        F1 = 2 * precision * recall / (precision + recall + 1e-4)
        positive_micro_F1 = (
            2
            * positive_micro_precision
            * recall
            / (positive_micro_precision + recall + 1e-4)
        )

        IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
        IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
        IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
        IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
        # Matthews correlation coefficient on image-level confusion counts
        IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
            (
                float(IL_TPs + IL_FPs)
                * float(IL_TPs + IL_FNs)
                * float(IL_TNs + IL_FPs)
                * float(IL_TNs + IL_FNs)
            )
            ** 0.5
            + 1e-6
        )

        self.eval = {
            "params": p,
            "TPs": TPs,
            "FPs": FPs,
            "positive_micro_FPs": pmFPs,
            "FNs": FNs,
            "precision": precision,
            "positive_micro_precision": positive_micro_precision,
            "recall": recall,
            "F1": F1,
            "positive_micro_F1": positive_micro_F1,
            # BUGFIX: guard against division by zero (NaN) when no image
            # produced a positive F1 score.
            "positive_macro_F1": local_F1s / max(valid_F1_count, 1),
            "IL_recall": IL_rec,
            "IL_precision": IL_prec,
            "IL_F1": IL_F1,
            "IL_FPR": IL_FPR,
            "IL_MCC": IL_MCC,
        }
        self.eval["cgF1"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]

    def summarize(self):
        """
        Compute and display summary metrics for evaluation results.

        Raises:
            Exception: if accumulate() has not been run yet.
        """
        if not self.eval:
            raise Exception("Please run accumulate() first")

        def _summarize(iouThr=None, metric=""):
            # Summarize a per-IoU-threshold metric, optionally at one threshold.
            p = self.params
            iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
            titleStr = "Average " + metric
            iouStr = (
                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
                if iouThr is None
                else "{:0.2f}".format(iouThr)
            )

            s = self.eval[metric]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]

            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, iouStr, mean_s))
            return mean_s

        def _summarize_single(metric=""):
            # Summarize a scalar (image-level) metric.
            titleStr = "Average " + metric
            iStr = " {:<35} = {:0.3f}"
            s = self.eval[metric]
            print(iStr.format(titleStr, s))
            return s

        def _summarizeDets():
            stats = []

            for metric in CGF1_METRICS:
                if metric.image_level:
                    stats.append(_summarize_single(metric=metric.name))
                else:
                    stats.append(
                        _summarize(iouThr=metric.iou_threshold, metric=metric.name)
                    )
            return np.asarray(stats)

        summarize = _summarizeDets
        self.stats = summarize()
+
501
+
502
def _evaluate(self):
    """
    Run per-image evaluation for a CGF1Eval instance.

    Unlike COCOeval.evaluate, this does not write into self.evalImgs; it
    returns (imgIds, evalImgs) with evalImgs shaped
    [numCats, numAreaRngs, numImgs].
    """
    p = self.params
    # Deduplicate image ids and force the category-agnostic setting.
    p.imgIds = list(np.unique(p.imgIds))
    p.useCats = False
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()

    # Only mask and box IoUs are supported.
    if p.iouType not in ("segm", "bbox"):
        raise RuntimeError(f"Unsupported iou {p.iouType}")
    iou_fn = self.computeIoU

    # Single pseudo-category (-1): evaluation is category-agnostic.
    cat_ids = [-1]

    self.ious = {}
    for img_id in p.imgIds:
        for cat_id in cat_ids:
            self.ious[(img_id, cat_id)] = iou_fn(img_id, cat_id)

    largest_max_det = p.maxDets[-1]
    per_image_results = []
    for cat_id in cat_ids:
        for area_rng in p.areaRng:
            for img_id in p.imgIds:
                per_image_results.append(
                    self.evaluateImg(img_id, cat_id, area_rng, largest_max_det)
                )

    # Reshape here (NOT done in pycocotools, where it happens outside).
    shaped = np.asarray(per_image_results).reshape(
        len(cat_ids), len(p.areaRng), len(p.imgIds)
    )
    return p.imgIds, shaped
537
+
538
+
539
class CGF1Evaluator:
    """
    Wrapper class for cgF1 evaluation.
    This supports the oracle setting (when several ground-truths are available per image)
    """

    def __init__(
        self,
        gt_path: Union[str, List[str]],
        iou_type="segm",
        verbose=False,
    ):
        """
        Args:
            gt_path (str or list of str): path(s) to ground truth COCO json file(s)
            iou_type (str): type of IoU to evaluate ("segm" or "bbox")
            verbose (bool): if True, print progress information during evaluate()
        """
        self.gt_paths = gt_path if isinstance(gt_path, list) else [gt_path]
        self.iou_type = iou_type

        self.coco_gts = [COCOCustom(gt) for gt in self.gt_paths]

        self.verbose = verbose

        self.coco_evals = []
        for coco_gt in self.coco_gts:
            coco_eval = CGF1Eval(
                coco_gt=coco_gt,
                iouType=iou_type,
            )
            # BUGFIX: pycocotools reads the flag from `params`; the previous
            # `coco_eval.useCats = False` set a dead attribute on the evaluator.
            coco_eval.params.useCats = False
            self.coco_evals.append(coco_eval)

        exclude_img_ids = set()
        # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
        for coco_gt in self.coco_gts[1:]:
            exclude_img_ids = exclude_img_ids.union(
                {
                    img["id"]
                    for img in coco_gt.dataset["images"]
                    if not img["is_instance_exhaustive"]
                }
            )
        # we only eval on instance exhaustive queries
        self.eval_img_ids = [
            img["id"]
            for img in self.coco_gts[0].dataset["images"]
            if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
        ]

    def evaluate(self, pred_file: str):
        """
        Evaluate the detections using cgF1 metric.

        Args:
            pred_file: path to the predictions COCO json file

        Returns:
            dict mapping "cgF1_eval_{iou_type}_{metric}" names to float values.
        """
        assert len(self.coco_gts) > 0, "No ground truth provided for evaluation."
        assert len(self.coco_gts) == len(
            self.coco_evals
        ), "Mismatch in number of ground truths and evaluators."

        if self.verbose:
            print(f"Loading predictions from {pred_file}")

        with open(pred_file, "r") as f:
            preds = json.load(f)

        if self.verbose:
            print(f"Loaded {len(preds)} predictions")

        # Group predictions by (image, noun-phrase) id for per-image scoring
        img2preds = defaultdict(list)
        for pred in preds:
            img2preds[pred["image_id"]].append(pred)

        all_eval_imgs = []
        for img_id in tqdm(self.eval_img_ids, disable=not self.verbose):
            results = img2preds[img_id]
            # Score this image against every available ground truth (oracle setting)
            all_scorings = []
            for cur_coco_gt, coco_eval in zip(self.coco_gts, self.coco_evals):
                # suppress pycocotools prints
                with open(os.devnull, "w") as devnull:
                    with contextlib.redirect_stdout(devnull):
                        coco_dt = (
                            cur_coco_gt.loadRes(results) if results else COCOCustom()
                        )

                        coco_eval.cocoDt = coco_dt
                        coco_eval.params.imgIds = [img_id]
                        coco_eval.params.useCats = False
                        img_ids, eval_imgs = _evaluate(coco_eval)
                        all_scorings.append(eval_imgs)
            selected = self._select_best_scoring(all_scorings)
            all_eval_imgs.append(selected)

        # After this point, we have selected the best scoring per image among several ground truths
        # we can now accumulate and summarize, using only the first coco_eval

        self.coco_evals[0].evalImgs = list(
            np.concatenate(all_eval_imgs, axis=2).flatten()
        )
        self.coco_evals[0].params.imgIds = self.eval_img_ids
        self.coco_evals[0]._paramsEval = copy.deepcopy(self.coco_evals[0].params)

        if self.verbose:
            print("Accumulating results")
        self.coco_evals[0].accumulate()
        print("cgF1 metric, IoU type={}".format(self.iou_type))
        self.coco_evals[0].summarize()
        print()

        # Flatten the stats array into a flat name -> value dict
        out = {}
        for i, value in enumerate(self.coco_evals[0].stats):
            name = CGF1_METRICS[i].name
            if CGF1_METRICS[i].iou_threshold is not None:
                name = f"{name}@{CGF1_METRICS[i].iou_threshold}"
            out[f"cgF1_eval_{self.iou_type}_{name}"] = float(value)

        return out

    @staticmethod
    def _select_best_scoring(scorings):
        # This function is used for "oracle" type evaluation.
        # It accepts the evaluation results with respect to several ground truths, and picks the best
        if len(scorings) == 1:
            return scorings[0]

        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"

        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"

        selected_imgs = []
        for img_id in range(scorings[0].shape[-1]):
            best = scorings[0][:, :, img_id]

            for scoring in scorings[1:]:
                current = scoring[:, :, img_id]
                if "local_F1s" in best[0, 0] and "local_F1s" in current[0, 0]:
                    # we were able to compute a F1 score for this particular image in both evaluations
                    # best["local_F1s"] contains the results at various IoU thresholds. We simply take the average for comparison
                    best_score = best[0, 0]["local_F1s"].mean()
                    current_score = current[0, 0]["local_F1s"].mean()
                    if current_score > best_score:
                        best = current

                else:
                    # If we're here, it means that in some evaluation we were not able to get a valid local F1
                    # This happens when both the predictions and targets are empty. In that case, we can assume it's a perfect prediction
                    if "local_F1s" not in current[0, 0]:
                        best = current
            selected_imgs.append(best)
        result = np.stack(selected_imgs, axis=-1)
        assert result.shape == scorings[0].shape
        return result
sam3/eval/coco_eval.py ADDED
@@ -0,0 +1,916 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """
4
+ COCO evaluator that works in distributed mode.
5
+
6
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
7
+ The difference is that there is less copy-pasting from pycocotools
8
+ in the end of the file, as python3 can suppress prints with contextlib
9
+ """
10
+
11
+ import contextlib
12
+ import copy
13
+ import json
14
+ import logging
15
+ import os
16
+ import pickle
17
+ from collections import defaultdict
18
+ from pathlib import Path
19
+
20
+ from typing import Any, List, Optional
21
+
22
+ import numpy as np
23
+
24
+ import pycocotools.mask as mask_utils
25
+ import torch
26
+ from iopath.common.file_io import g_pathmgr
27
+ from pycocotools.coco import COCO
28
+ from pycocotools.cocoeval import COCOeval
29
+
30
+ from sam3.train.masks_ops import rle_encode
31
+
32
+ from sam3.train.utils.distributed import (
33
+ all_gather,
34
+ gather_to_rank_0_via_filesys,
35
+ get_rank,
36
+ is_main_process,
37
+ )
38
+
39
+ RARITY_BUCKETS = {0: "frequent", 1: "common", 2: "medium", 3: "rare"}
40
+
41
+
42
+ class CocoEvaluator:
43
+ def __init__(
44
+ self,
45
+ coco_gt,
46
+ iou_types: List[str],
47
+ useCats: bool,
48
+ dump_dir: Optional[str],
49
+ postprocessor,
50
+ average_by_rarity=False,
51
+ metrics_dump_dir: Optional[str] = None,
52
+ gather_pred_via_filesys=False,
53
+ use_normalized_areas=True,
54
+ maxdets=[1, 10, 100],
55
+ exhaustive_only=False,
56
+ all_exhaustive_only=True,
57
+ ):
58
+ """Online coco evaluator. It will evaluate images as they are generated by the model, then accumulate/summarize at the end
59
+
60
+ Args:
61
+ - coco_gt: COCO api object containing the gt
62
+ - iou_types: can be either "bbox" or "segm"
63
+ - useCats: If true, categories will be used for evaluation
64
+ - dump_dir: if non null, then the predictions will be dumped in that directory
65
+ - postprocessor: Module to convert the model's output into the coco format
66
+ - average_by_rarity: if true then we expect the images information in the gt dataset
67
+ to have a "rarity" field. Then the AP will be computed on all rarity buckets
68
+ individually, then averaged
69
+ - gather_pred_via_filesys: if true, we use the filesystem for collective gathers
70
+ - use_normalized_areas: if true, the areas of the objects in the GT are assumed to be
71
+ normalized by the area of the image. In that case, the size buckets are adjusted
72
+ - maxdets: maximal number of detections to be evaluated on each image.
73
+ - exhaustive_only: If true, we restrict eval only to exhaustive annotations
74
+ - all_exhaustive_only: If true, datapoints are restricted only to those with all exhaustive annotations
75
+
76
+ """
77
+ # coco_gt = copy.deepcopy(coco_gt)
78
+ self.coco_gts = [coco_gt] if not isinstance(coco_gt, list) else coco_gt
79
+ assert len(maxdets) == 3, f"expecting 3 detection threshold, got {len(maxdets)}"
80
+
81
+ self.use_normalized_areas = use_normalized_areas
82
+ self.iou_types = iou_types
83
+ self.useCats = useCats
84
+ self.maxdets = maxdets
85
+ self.dump = None
86
+ self.dump_dir = dump_dir
87
+ if self.dump_dir is not None:
88
+ self.dump = []
89
+ if is_main_process():
90
+ if not os.path.exists(self.dump_dir):
91
+ os.makedirs(self.dump_dir, exist_ok=True)
92
+ logging.info(f"Create the folder: {dump_dir}")
93
+
94
+ self.initialized = False
95
+
96
+ # Whether to gather predictions through filesystem (instead of torch
97
+ # collective ops; requiring a shared filesystem across all ranks)
98
+ self.gather_pred_via_filesys = gather_pred_via_filesys
99
+ self.use_self_evaluate = True # CPP version is disabled
100
+ self.postprocessor = postprocessor
101
+ self.average_by_rarity = average_by_rarity
102
+ self.exhaustive_only = exhaustive_only
103
+ self.all_exhaustive_only = all_exhaustive_only
104
+ self.metrics_dump_dir = metrics_dump_dir
105
+ if self.metrics_dump_dir is not None:
106
+ if is_main_process():
107
+ if not os.path.exists(self.metrics_dump_dir):
108
+ os.makedirs(self.metrics_dump_dir, exist_ok=True)
109
+ logging.info(f"Create the folder: {metrics_dump_dir}")
110
+
111
+ def _lazy_init(self, coco_cls=COCO):
112
+ if self.initialized:
113
+ return
114
+
115
+ self.initialized = True
116
+
117
+ self.coco_gts = [
118
+ coco_cls(g_pathmgr.get_local_path(gt)) if isinstance(gt, str) else gt
119
+ for gt in self.coco_gts
120
+ ]
121
+
122
+ self.reset()
123
+
124
+ self.eval_img_ids = None
125
+
126
+ if self.exhaustive_only:
127
+ exclude_img_ids = set()
128
+ # exclude_img_ids are the ids that are not exhaustively annotated in any of the other gts
129
+ if self.all_exhaustive_only:
130
+ for coco_gt in self.coco_gts[1:]:
131
+ exclude_img_ids = exclude_img_ids.union(
132
+ {
133
+ img["id"]
134
+ for img in coco_gt.dataset["images"]
135
+ if not img["is_instance_exhaustive"]
136
+ }
137
+ )
138
+ # we only eval on instance exhaustive queries
139
+ self.eval_img_ids = [
140
+ img["id"]
141
+ for img in self.coco_gts[0].dataset["images"]
142
+ if (img["is_instance_exhaustive"] and img["id"] not in exclude_img_ids)
143
+ ]
144
+
145
+ self.rarity_buckets = None
146
+ if self.average_by_rarity:
147
+ self.rarity_buckets = defaultdict(list)
148
+ eval_img_ids_set = (
149
+ set(self.eval_img_ids) if self.eval_img_ids is not None else None
150
+ )
151
+ for img in self.coco_gts[0].dataset["images"]:
152
+ if self.eval_img_ids is not None and img["id"] not in eval_img_ids_set:
153
+ continue
154
+ self.rarity_buckets[img["rarity"]].append(img["id"])
155
+ print("Rarity buckets sizes:")
156
+ for k, v in self.rarity_buckets.items():
157
+ print(f"{k}: {len(v)}")
158
+
159
+ def set_sync_device(self, device: torch.device) -> Any:
160
+ self._sync_device = device
161
+
162
+ def _evaluate(self, *args, **kwargs):
163
+ return evaluate(*args, **kwargs)
164
+
165
+ def _loadRes(self, *args, **kwargs):
166
+ return loadRes(*args, **kwargs)
167
+
168
+ def update(self, *args, **kwargs):
169
+ self._lazy_init()
170
+ predictions = self.postprocessor.process_results(*args, **kwargs)
171
+
172
+ img_ids = list(np.unique(list(predictions.keys())))
173
+ self.img_ids.extend(img_ids)
174
+
175
+ for iou_type in self.iou_types:
176
+ results = self.prepare(predictions, iou_type)
177
+ self._dump(results)
178
+
179
+ assert len(self.coco_gts) == len(self.coco_evals)
180
+ all_scorings = []
181
+ for cur_coco_gt, cur_coco_eval in zip(self.coco_gts, self.coco_evals):
182
+ # suppress pycocotools prints
183
+ with open(os.devnull, "w") as devnull:
184
+ with contextlib.redirect_stdout(devnull):
185
+ coco_dt = (
186
+ self._loadRes(cur_coco_gt, results) if results else COCO()
187
+ )
188
+
189
+ coco_eval = cur_coco_eval[iou_type]
190
+
191
+ coco_eval.cocoDt = coco_dt
192
+ coco_eval.params.imgIds = list(img_ids)
193
+ coco_eval.params.useCats = self.useCats
194
+ coco_eval.params.maxDets = self.maxdets
195
+ img_ids, eval_imgs = self._evaluate(coco_eval, self.use_self_evaluate)
196
+ all_scorings.append(eval_imgs)
197
+
198
+ selected = self.select_best_scoring(all_scorings)
199
+ self.eval_imgs[iou_type].append(selected)
200
+
201
+ def select_best_scoring(self, scorings):
202
+ # This function is used for "oracle" type evaluation.
203
+ # It accepts the evaluation results with respect to several ground truths, and picks the best
204
+ if len(scorings) == 1:
205
+ return scorings[0]
206
+
207
+ # Currently we don't support Oracle Phrase AP.
208
+ # To implement it, we likely need to modify the cpp code since the eval_image type is opaque
209
+ raise RuntimeError("Not implemented")
210
+
211
+ def _dump(self, results):
212
+ if self.dump is not None:
213
+ dumped_results = copy.deepcopy(results)
214
+ for r in dumped_results:
215
+ if "bbox" not in self.iou_types and "bbox" in r:
216
+ del r["bbox"]
217
+ elif "bbox" in r:
218
+ r["bbox"] = [round(coord, 5) for coord in r["bbox"]]
219
+ r["score"] = round(r["score"], 5)
220
+ self.dump.extend(dumped_results)
221
+
222
+ def synchronize_between_processes(self):
223
+ self._lazy_init()
224
+ logging.info("Coco evaluator: Synchronizing between processes")
225
+ for iou_type in self.iou_types:
226
+ if len(self.eval_imgs[iou_type]) > 0:
227
+ self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
228
+ else:
229
+ num_areas = len(self.coco_evals[0][iou_type].params.areaRng)
230
+ # assuming 1 class
231
+ assert not self.useCats
232
+ self.eval_imgs[iou_type] = np.empty((1, num_areas, 0))
233
+ create_common_coco_eval(
234
+ self.coco_evals[0][iou_type],
235
+ self.img_ids,
236
+ self.eval_imgs[iou_type],
237
+ use_self_evaluate=self.use_self_evaluate,
238
+ gather_pred_via_filesys=self.gather_pred_via_filesys,
239
+ metrics_dump_dir=self.metrics_dump_dir,
240
+ )
241
+ if self.dump is not None:
242
+ dumped_file = Path(self.dump_dir) / f"coco_predictions_{get_rank()}.json"
243
+ logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
244
+ with g_pathmgr.open(str(dumped_file), "w") as f:
245
+ json.dump(self.dump, f)
246
+
247
+ # if self.gather_pred_via_filesys:
248
+ # dump = gather_to_rank_0_via_filesys(self.dump)
249
+ # else:
250
+ # dump = all_gather(self.dump, force_cpu=True)
251
+ # self.dump = sum(dump, [])
252
+
253
+ def accumulate(self, imgIds=None):
254
+ self._lazy_init()
255
+ logging.info(
256
+ f"Coco evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
257
+ )
258
+ if not is_main_process():
259
+ return
260
+
261
+ if imgIds is None:
262
+ for coco_eval in self.coco_evals[0].values():
263
+ accumulate(coco_eval, use_self_eval=self.use_self_evaluate)
264
+
265
+ if imgIds is not None:
266
+ imgIds = set(imgIds)
267
+ for coco_eval in self.coco_evals[0].values():
268
+ p = coco_eval.params
269
+ id_mask = np.array([(i in imgIds) for i in p.imgIds], dtype=bool)
270
+ old_img_ids = p.imgIds
271
+ coco_eval.params.imgIds = np.asarray(p.imgIds)[id_mask]
272
+ old_img_evals = coco_eval.evalImgs
273
+ catIds = p.catIds if p.useCats else [-1]
274
+ coco_eval.evalImgs = list(
275
+ np.asarray(coco_eval.evalImgs)
276
+ .reshape(len(catIds), len(p.areaRng), len(old_img_ids))[
277
+ ..., id_mask
278
+ ]
279
+ .flatten()
280
+ )
281
+ accumulate(coco_eval, use_self_eval=self.use_self_evaluate)
282
+ coco_eval.evalImgs = old_img_evals
283
+ coco_eval.params.imgIds = old_img_ids
284
+
285
+ def summarize(self):
286
+ self._lazy_init()
287
+ logging.info("Coco evaluator: Summarizing")
288
+ if not is_main_process():
289
+ return {}
290
+
291
+ outs = {}
292
+ if self.rarity_buckets is None:
293
+ self.accumulate(self.eval_img_ids)
294
+ for iou_type, coco_eval in self.coco_evals[0].items():
295
+ print("IoU metric: {}".format(iou_type))
296
+ summarize(coco_eval)
297
+
298
+ if "bbox" in self.coco_evals[0]:
299
+ for key, value in zip(*self.coco_evals[0]["bbox"].stats):
300
+ outs[f"coco_eval_bbox_{key}"] = value
301
+ if "segm" in self.coco_evals[0]:
302
+ for key, value in zip(*self.coco_evals[0]["segm"].stats):
303
+ outs[f"coco_eval_masks_{key}"] = value
304
+ else:
305
+ total_stats = {}
306
+ all_keys = {}
307
+ for bucket, img_list in self.rarity_buckets.items():
308
+ self.accumulate(imgIds=img_list)
309
+ bucket_name = RARITY_BUCKETS[bucket]
310
+ for iou_type, coco_eval in self.coco_evals[0].items():
311
+ print(f"IoU metric: {iou_type}. Rarity bucket: {bucket_name}")
312
+ summarize(coco_eval)
313
+
314
+ if "bbox" in self.coco_evals[0]:
315
+ if "bbox" not in total_stats:
316
+ total_stats["bbox"] = np.zeros_like(
317
+ self.coco_evals[0]["bbox"].stats[1]
318
+ )
319
+ all_keys["bbox"] = self.coco_evals[0]["bbox"].stats[0]
320
+ total_stats["bbox"] += self.coco_evals[0]["bbox"].stats[1]
321
+ for key, value in zip(*self.coco_evals[0]["bbox"].stats):
322
+ outs[f"coco_eval_bbox_{bucket_name}_{key}"] = value
323
+ if "segm" in self.coco_evals[0]:
324
+ if "segm" not in total_stats:
325
+ total_stats["segm"] = np.zeros_like(
326
+ self.coco_evals[0]["segm"].stats[1]
327
+ )
328
+ all_keys["segm"] = self.coco_evals[0]["segm"].stats[0]
329
+ total_stats["segm"] += self.coco_evals[0]["segm"].stats[1]
330
+ for key, value in zip(*self.coco_evals[0]["segm"].stats):
331
+ outs[f"coco_eval_masks_{bucket_name}_{key}"] = value
332
+
333
+ if "bbox" in total_stats:
334
+ total_stats["bbox"] /= len(self.rarity_buckets)
335
+ for key, value in zip(all_keys["bbox"], total_stats["bbox"]):
336
+ outs[f"coco_eval_bbox_{key}"] = value
337
+ if "segm" in total_stats:
338
+ total_stats["segm"] /= len(self.rarity_buckets)
339
+ for key, value in zip(all_keys["segm"], total_stats["segm"]):
340
+ outs[f"coco_eval_masks_{key}"] = value
341
+
342
+ # if self.dump is not None:
343
+ # assert self.dump_dir is not None
344
+ # logging.info("Coco evaluator: Dumping the global result file to disk")
345
+ # with g_pathmgr.open(str(Path(self.dump_dir) / "coco_eval.json"), "w") as f:
346
+ # json.dump(self.dump, f)
347
+ return outs
348
+
349
+ def compute_synced(self):
350
+ self._lazy_init()
351
+ self.synchronize_between_processes()
352
+ return self.summarize()
353
+
354
+ def compute(self):
355
+ self._lazy_init()
356
+ return {"": 0.0}
357
+
358
    def reset(self, cocoeval_cls=COCOeval):
        """Re-create fresh COCOeval objects for every GT set and iou type.

        Args:
            cocoeval_cls: the COCOeval implementation to instantiate; a
                parameter so callers can swap in a custom/faster evaluator.
        """
        self.coco_evals = [{} for _ in range(len(self.coco_gts))]
        for i, coco_gt in enumerate(self.coco_gts):
            for iou_type in self.iou_types:
                self.coco_evals[i][iou_type] = cocoeval_cls(coco_gt, iouType=iou_type)
                self.coco_evals[i][iou_type].params.useCats = self.useCats
                self.coco_evals[i][iou_type].params.maxDets = self.maxdets
                if self.use_normalized_areas:
                    # Area ranges expressed as fractions of the image area
                    # (object area / (h*w)) instead of absolute pixel counts;
                    # labels below must stay aligned with these ranges.
                    self.coco_evals[i][iou_type].params.areaRng = [
                        [0, 1e5],
                        [0, 0.001],
                        [0.001, 0.01],
                        [0.01, 0.1],
                        [0.1, 0.5],
                        [0.5, 0.95],
                        [0.95, 1e5],
                    ]
                    self.coco_evals[i][iou_type].params.areaRngLbl = [
                        "all",
                        "tiny",
                        "small",
                        "medium",
                        "large",
                        "huge",
                        "whole_image",
                    ]

        # Per-epoch accumulators, cleared on every reset.
        self.img_ids = []
        self.eval_imgs = {k: [] for k in self.iou_types}
        if self.dump is not None:
            self.dump = []
389
+
390
+ def write(self, stats):
391
+ self._lazy_init()
392
+ """Write the results in the stats dict"""
393
+ if "bbox" in self.coco_evals[0]:
394
+ stats["coco_eval_bbox"] = self.coco_evals[0]["bbox"].stats.tolist()
395
+ if "segm" in self.coco_evals[0]:
396
+ stats["coco_eval_masks"] = self.coco_evals[0]["segm"].stats.tolist()
397
+ return stats
398
+
399
+ def prepare(self, predictions, iou_type):
400
+ self._lazy_init()
401
+ if iou_type == "bbox":
402
+ return self.prepare_for_coco_detection(predictions)
403
+ elif iou_type == "segm":
404
+ return self.prepare_for_coco_segmentation(predictions)
405
+ elif iou_type == "keypoints":
406
+ return self.prepare_for_coco_keypoint(predictions)
407
+ else:
408
+ raise ValueError("Unknown iou type {}".format(iou_type))
409
+
410
    def prepare_for_coco_detection(self, predictions):
        """Convert ``{image_id: prediction_dict}`` into COCO detection results.

        Each prediction dict is expected to contain "boxes" (xyxy, converted
        here to COCO xywh), "scores" and "labels". Empty predictions are
        skipped. Returns a flat list of COCO result dicts.
        """
        self._lazy_init()
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                    for k, box in enumerate(boxes)
                ]
            )
        return coco_results
434
+
435
    @torch.no_grad()
    def prepare_for_coco_segmentation(self, predictions):
        """Convert ``{image_id: prediction_dict}`` into COCO segmentation results.

        Accepts either pre-encoded RLEs ("masks_rle") or raw mask tensors
        ("masks", thresholded at 0.5 then RLE-encoded here). The "area"
        field is stored as a fraction of the image area (area / (h*w)).
        Optional per-instance boundary data is passed through when present.
        """
        self._lazy_init()
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            boundaries, dilated_boundaries = None, None
            if "boundaries" in prediction:
                boundaries = prediction["boundaries"]
                dilated_boundaries = prediction["dilated_boundaries"]
                assert dilated_boundaries is not None
                assert len(scores) == len(boundaries)

            if "masks_rle" in prediction:
                # Masks already RLE-encoded; derive normalized areas from the RLE.
                rles = prediction["masks_rle"]
                areas = []
                for rle in rles:
                    cur_area = mask_utils.area(rle)
                    h, w = rle["size"]
                    areas.append(cur_area / (h * w))
            else:
                masks = prediction["masks"]

                # Binarize soft masks before encoding.
                masks = masks > 0.5
                h, w = masks.shape[-2:]

                areas = masks.flatten(1).sum(1) / (h * w)
                areas = areas.tolist()

                # squeeze(1) drops the channel dim — assumes masks are
                # (N, 1, H, W); TODO confirm against the model's output shape.
                rles = rle_encode(masks.squeeze(1))

                # Free the (potentially large) mask tensors eagerly to keep
                # peak memory down while iterating over many images.
                del masks
                del prediction["masks"]

            assert len(areas) == len(rles) == len(scores)
            for k, rle in enumerate(rles):
                payload = {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                    "area": areas[k],
                }
                if boundaries is not None:
                    payload["boundary"] = boundaries[k]
                    payload["dilated_boundary"] = dilated_boundaries[k]

                coco_results.append(payload)

        return coco_results
490
+
491
    def prepare_for_coco_keypoint(self, predictions):
        """Convert ``{image_id: prediction_dict}`` into COCO keypoint results.

        Keypoints are flattened per instance into the COCO
        ``[x1, y1, v1, x2, y2, v2, ...]`` layout.
        """
        self._lazy_init()
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            # NOTE(review): boxes are converted but never added to the
            # payload below — kept for parity with the original code.
            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            keypoints = prediction["keypoints"]
            keypoints = keypoints.flatten(start_dim=1).tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "keypoints": keypoint,
                        "score": scores[k],
                    }
                    for k, keypoint in enumerate(keypoints)
                ]
            )
        return coco_results
517
+
518
+
519
def convert_to_xywh(boxes):
    """Convert boxes from (xmin, ymin, xmax, ymax) to (x, y, width, height)."""
    x_min, y_min, x_max, y_max = boxes.unbind(-1)
    widths = x_max - x_min
    heights = y_max - y_min
    return torch.stack((x_min, y_min, widths, heights), dim=-1)
522
+
523
+
524
def merge(img_ids, eval_imgs, gather_pred_via_filesys=False):
    """Gather per-rank image ids and per-image eval arrays onto rank 0.

    Returns (merged_img_ids, merged_eval_imgs) on the main process and
    (None, None) on all other ranks. Duplicate image ids across ranks are
    dropped via np.unique (which also sorts the ids).
    """
    if gather_pred_via_filesys:
        # only gather the predictions to rank 0 (other ranks will receive empty
        # lists for `all_img_ids` and `all_eval_imgs`, which should be OK as
        # merging and evaluation are only done on rank 0)
        all_img_ids = gather_to_rank_0_via_filesys(img_ids)
        all_eval_imgs = gather_to_rank_0_via_filesys(eval_imgs)
    else:
        all_img_ids = all_gather(img_ids, force_cpu=True)
        all_eval_imgs = all_gather(eval_imgs, force_cpu=True)
    if not is_main_process():
        return None, None

    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)

    # Keep each rank's array whole (append, not extend); they are joined
    # along axis 2 — presumably the per-image axis — below. TODO confirm.
    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)

    merged_img_ids = np.array(merged_img_ids)
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)

    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]

    return merged_img_ids, merged_eval_imgs
553
+
554
+
555
def create_common_coco_eval(
    coco_eval,
    img_ids,
    eval_imgs,
    use_self_evaluate,
    gather_pred_via_filesys=False,
    metrics_dump_dir=None,
):
    """Merge distributed per-image results into ``coco_eval`` (rank 0 only).

    Gathers (img_ids, eval_imgs) from all ranks, optionally dumps the
    per-image metrics to disk, runs a dummy evaluation pass for GT images
    that received no predictions, and installs the merged arrays into
    ``coco_eval`` so that accumulate()/summarize() can run afterwards.
    Non-main processes return immediately after the gather.
    """
    img_ids, eval_imgs = merge(img_ids, eval_imgs, gather_pred_via_filesys)
    if not is_main_process():
        return
    if metrics_dump_dir is not None:
        dumped_file = (
            Path(metrics_dump_dir) / f"coco_eval_img_metrics_{get_rank()}.json"
        )
        logging.info(f"COCO evaluator: Dumping local predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            # default= handles numpy arrays nested in the eval structures
            json.dump(eval_imgs.squeeze(), f, default=lambda x: x.tolist())
    img_ids = list(img_ids)

    # If some images were not predicted, we need to create dummy detections for them
    missing_img_ids = set(coco_eval.cocoGt.getImgIds()) - set(img_ids)
    if len(missing_img_ids) > 0:
        print(f"WARNING: {len(missing_img_ids)} images were not predicted!")
        # Empty detection set: evaluating it yields all-negative entries
        # for the missing images.
        coco_eval.cocoDt = COCO()
        coco_eval.params.imgIds = list(missing_img_ids)
        new_img_ids, new_eval_imgs = evaluate(coco_eval, use_self_evaluate)
        img_ids.extend(new_img_ids)
        eval_imgs = np.concatenate((eval_imgs, new_eval_imgs), axis=2)

    eval_imgs = list(eval_imgs.flatten())
    assert len(img_ids) == len(coco_eval.cocoGt.getImgIds())

    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
591
+
592
+
593
+ #################################################################
594
+ # From pycocotools, just removed the prints and fixed
595
+ # a Python3 bug about unicode not defined
596
+ #################################################################
597
+
598
+
599
+ # Copy of COCO prepare, but doesn't convert anntoRLE
600
# Copy of COCO prepare, but doesn't convert annToRLE
def segmentation_prepare(self):
    """
    Prepare ._gts and ._dts for evaluation based on params.

    Copy of pycocotools' COCOeval._prepare without the annToRLE conversion
    (segmentations are assumed to already be in RLE form).
    :return: None
    """
    p = self.params
    if p.useCats:
        gts = self.cocoGt.loadAnns(
            self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
        dts = self.cocoDt.loadAnns(
            self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
        )
    else:
        gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
        dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

    for gt in gts:
        # NOTE: matches upstream pycocotools verbatim — the first assignment
        # is immediately overwritten, so "ignore" effectively tracks iscrowd
        # (plus zero-keypoint GTs for the keypoints iou type).
        gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
        gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
        if p.iouType == "keypoints":
            gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
    self._gts = defaultdict(list)  # gt for evaluation
    self._dts = defaultdict(list)  # dt for evaluation
    for gt in gts:
        self._gts[gt["image_id"], gt["category_id"]].append(gt)
    for dt in dts:
        self._dts[dt["image_id"], dt["category_id"]].append(dt)
    self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
    self.eval = {}  # accumulated evaluation results
630
+
631
+
632
def evaluate(self, use_self_evaluate):
    """
    Run per image evaluation on given images and store results (a list of
    dict) in self.evalImgs.

    Near-verbatim copy of pycocotools' COCOeval.evaluate, except it returns
    ``(imgIds, evalImgs)`` with evalImgs reshaped to
    [n_cats x n_areaRng x n_imgs]. NOTE(review): when ``use_self_evaluate``
    is falsy the function falls through and implicitly returns None — the
    C++ fast path that used to handle that branch is commented out below.
    """
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = "segm" if p.useSegm == 1 else "bbox"
        print(
            "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType)
        )
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    # select IoU computation; keypoints use OKS instead of box/mask IoU
    if p.iouType == "segm" or p.iouType == "bbox":
        computeIoU = self.computeIoU
    elif p.iouType == "keypoints":
        computeIoU = self.computeOks
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds
    }

    maxDet = p.maxDets[-1]
    if use_self_evaluate:
        evalImgs = [
            self.evaluateImg(imgId, catId, areaRng, maxDet)
            for catId in catIds
            for areaRng in p.areaRng
            for imgId in p.imgIds
        ]
        # this is NOT in the pycocotools code, but could be done outside
        evalImgs = np.asarray(evalImgs).reshape(
            len(catIds), len(p.areaRng), len(p.imgIds)
        )
        return p.imgIds, evalImgs
680
+
681
+ # <<<< Beginning of code differences with original COCO API
682
+ # def convert_instances_to_cpp(instances, is_det=False):
683
+ # # Convert annotations for a list of instances in an image to a format that's fast
684
+ # # to access in C++
685
+ # instances_cpp = []
686
+ # for instance in instances:
687
+ # instance_cpp = _CPP.InstanceAnnotation(
688
+ # int(instance["id"]),
689
+ # instance["score"] if is_det else instance.get("score", 0.0),
690
+ # instance["area"],
691
+ # bool(instance.get("iscrowd", 0)),
692
+ # bool(instance.get("ignore", 0)),
693
+ # )
694
+ # instances_cpp.append(instance_cpp)
695
+ # return instances_cpp
696
+
697
+ # # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
698
+ # ground_truth_instances = [
699
+ # [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
700
+ # for imgId in p.imgIds
701
+ # ]
702
+ # detected_instances = [
703
+ # [
704
+ # convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
705
+ # for catId in p.catIds
706
+ # ]
707
+ # for imgId in p.imgIds
708
+ # ]
709
+ # ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
710
+
711
+ # if not p.useCats:
712
+ # # For each image, flatten per-category lists into a single list
713
+ # ground_truth_instances = [
714
+ # [[o for c in i for o in c]] for i in ground_truth_instances
715
+ # ]
716
+ # detected_instances = [[[o for c in i for o in c]] for i in detected_instances]
717
+
718
+ # # Call C++ implementation of self.evaluateImgs()
719
+ # _evalImgs_cpp = _CPP.COCOevalEvaluateImages(
720
+ # p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances
721
+ # )
722
+
723
+ # self._paramsEval = copy.deepcopy(self.params)
724
+ # evalImgs = np.asarray(_evalImgs_cpp).reshape(
725
+ # len(catIds), len(p.areaRng), len(p.imgIds)
726
+ # )
727
+ # return p.imgIds, evalImgs
728
+
729
+
730
+ #################################################################
731
+ # end of straight copy from pycocotools, just removing the prints
732
+ #################################################################
733
+
734
+
735
+ #################################################################
736
+ # From pycocotools, but disabled mask->box conversion which is
737
+ # pointless
738
+ #################################################################
739
def loadRes(self, resFile):
    """
    Load result file and return a result api object.
    :param resFile (str) : file name of result file; may also be a list of
        annotation dicts or a numpy-annotation ndarray
    :return: res (obj) : result api object

    Copy of pycocotools' COCO.loadRes with the mask->area/bbox conversion
    disabled for segmentation results (those fields are expected to be
    supplied by the caller).
    """
    res = COCO()
    res.dataset["images"] = [img for img in self.dataset["images"]]

    if type(resFile) == str:
        anns = json.load(open(resFile))
    elif type(resFile) == np.ndarray:
        anns = self.loadNumpyAnnotations(resFile)
    else:
        anns = resFile
    assert type(anns) == list, "results in not an array of objects"
    annsImgIds = [ann["image_id"] for ann in anns]
    assert set(annsImgIds) == (
        set(annsImgIds) & set(self.getImgIds())
    ), "Results do not correspond to current coco set"
    if "caption" in anns[0]:
        # caption results: keep only images that actually have predictions
        imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
            [ann["image_id"] for ann in anns]
        )
        res.dataset["images"] = [
            img for img in res.dataset["images"] if img["id"] in imgIds
        ]
        for id, ann in enumerate(anns):
            ann["id"] = id + 1
    elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            bb = ann["bbox"]
            x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
            if "segmentation" not in ann:
                # synthesize a rectangular polygon covering the box
                ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
            ann["area"] = bb[2] * bb[3]
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "segmentation" in anns[0]:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # now only support compressed RLE format as segmentation results
            # ann["area"] = mask_util.area(ann["segmentation"])
            # The following lines are disabled because they are pointless
            # if not 'bbox' in ann:
            #     ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
            ann["id"] = id + 1
            ann["iscrowd"] = 0
    elif "keypoints" in anns[0]:
        res.dataset["categories"] = copy.deepcopy(self.dataset["categories"])
        for id, ann in enumerate(anns):
            # derive bbox/area from the keypoint extents
            s = ann["keypoints"]
            x = s[0::3]
            y = s[1::3]
            x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
            ann["area"] = (x1 - x0) * (y1 - y0)
            ann["id"] = id + 1
            ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]

    res.dataset["annotations"] = anns
    res.createIndex()
    return res
802
+
803
+
804
+ #################################################################
805
+ # end of straight copy from pycocotools
806
+ #################################################################
807
+
808
+
809
+ #################################################################
810
+ # From pycocotools, but added handling of custom area rngs, and returns stat keys
811
+ #################################################################
812
def summarize(self):
    """
    Compute and display summary metrics for evaluation results.
    Note this function can *only* be applied on the default parameter
    setting, here extended to support custom area ranges. Unlike upstream
    pycocotools, ``self.stats`` is set to a ``(keys, stats)`` tuple so
    callers can recover metric names alongside their values.
    """

    def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
        # Mirrors pycocotools' inner _summarize: slice the accumulated
        # precision/recall tensors and average the valid (> -1) entries.
        p = self.params
        iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
        titleStr = "Average Precision" if ap == 1 else "Average Recall"
        typeStr = "(AP)" if ap == 1 else "(AR)"
        iouStr = (
            "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
            if iouThr is None
            else "{:0.2f}".format(iouThr)
        )

        aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
        mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
        if ap == 1:
            # dimension of precision: [TxRxKxAxM]
            s = self.eval["precision"]
            # IoU
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, :, aind, mind]
        else:
            # dimension of recall: [TxKxAxM]
            s = self.eval["recall"]
            if iouThr is not None:
                t = np.where(iouThr == p.iouThrs)[0]
                s = s[t]
            s = s[:, :, aind, mind]
        if len(s[s > -1]) == 0:
            mean_s = -1  # no valid entries for this setting
        else:
            mean_s = np.mean(s[s > -1])
        print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
        return mean_s

    def _summarizeDets():
        # 3 AP + 3 AR headline metrics, plus one AP and one AR per
        # non-"all" area range.
        nb_results = 6 + (len(self.params.areaRng) - 1) * 2
        assert len(self.params.areaRng) == len(self.params.areaRngLbl)
        stats = np.zeros((nb_results,))
        keys = ["AP", "AP_50", "AP_75"]
        stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
        stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
        stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
        cur_id = 3
        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(1, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AP_{area}")
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[0])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[1])
        cur_id += 1
        stats[cur_id] = _summarize(0, maxDets=self.params.maxDets[2])
        cur_id += 1
        # NOTE(review): these three values are AR at maxDets[0]/[1]/[2],
        # not AR at IoU 0.50/0.75 — the "AR_50"/"AR_75" labels are
        # misleading, but renaming them would break downstream consumers
        # of the metric names.
        keys += ["AR", "AR_50", "AR_75"]

        for area in self.params.areaRngLbl[1:]:
            stats[cur_id] = _summarize(0, areaRng=area, maxDets=self.params.maxDets[2])
            cur_id += 1
            keys.append(f"AR_{area}")
        assert len(stats) == len(keys)
        return keys, stats

    if not self.eval:
        raise Exception("Please run accumulate() first")
    self.stats = _summarizeDets()
884
+
885
+
886
+ #################################################################
887
+ # end of straight copy from pycocotools
888
+ #################################################################
889
+
890
+
891
+ #################################################################
892
+ # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/fast_eval_api.py
893
+ # with slight adjustments
894
+ #################################################################
895
def accumulate(self, use_self_eval=False):
    """
    Accumulate per image evaluation results and store the result in
    self.eval. Does not support changing parameter settings from those used
    by self.evaluate().

    NOTE(review): with ``use_self_eval=False`` this is currently a no-op —
    the fast C++ accumulation path below is commented out. Also, if this
    module-level function is ever installed as the instance's
    ``accumulate`` method, the ``self.accumulate()`` call would recurse;
    it is presumably meant to dispatch to the stock
    ``COCOeval.accumulate`` — verify at the call site.
    """
    if use_self_eval:
        # Fall back to pycocotools' own accumulation.
        self.accumulate()
        return
    # CPP code is disabled
    # self.eval = _CPP.COCOevalAccumulate(self.params, self.evalImgs)

    # # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
    # self.eval["recall"] = np.array(self.eval["recall"]).reshape(
    #     self.eval["counts"][:1] + self.eval["counts"][2:]
    # )

    # # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
    # # num_area_ranges X num_max_detections
    # self.eval["precision"] = np.array(self.eval["precision"]).reshape(
    #     self.eval["counts"]
    # )
    # self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
sam3/eval/coco_eval_offline.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """
4
+ This evaluator is meant for regular COCO mAP evaluation, for example on the COCO val set.
5
+
6
+ For Category mAP, we need the model to make predictions for all the categories on every single image.
7
+ In general, since the number of classes can be big, and the API model makes predictions individually for each pair (image, class),
8
+ we may need to split the inference process for a given image in several chunks.
9
+ """
10
+
11
+ import logging
12
+ from collections import defaultdict
13
+
14
+ import torch
15
+ from pycocotools.coco import COCO
16
+ from pycocotools.cocoeval import COCOeval
17
+ from sam3.train.utils.distributed import is_main_process
18
+
19
+ try:
20
+ from tidecv import datasets, TIDE
21
+
22
+ HAS_TIDE = True
23
+ except ImportError:
24
+ HAS_TIDE = False
25
+ print("WARNING: TIDE not installed. Detailed analysis will not be available.")
26
+
27
+
28
+ # the COCO detection metrics (https://github.com/cocodataset/cocoapi/blob/8c9bcc3cf640524c4c20a9c40e89cb6a2f2fa0e9/PythonAPI/pycocotools/cocoeval.py#L460-L471)
29
+ COCO_METRICS = [
30
+ "AP",
31
+ "AP_50",
32
+ "AP_75",
33
+ "AP_small",
34
+ "AP_medium",
35
+ "AP_large",
36
+ "AR_maxDets@1",
37
+ "AR_maxDets@10",
38
+ "AR_maxDets@100",
39
+ "AR_small",
40
+ "AR_medium",
41
+ "AR_large",
42
+ ]
43
+
44
+
45
def convert_to_xywh(boxes):
    """Convert bounding boxes from xyxy format to xywh format."""
    # Slice the corner pairs and derive width/height from their difference.
    top_left = boxes[..., :2]
    bottom_right = boxes[..., 2:]
    return torch.cat((top_left, bottom_right - top_left), dim=-1)
49
+
50
+
51
class HeapElement:
    """Utility class to make a heap with a custom comparator"""

    def __init__(self, val):
        self.val = val

    def __lt__(self, other):
        # Min-heap ordering: compare the wrapped predictions by score.
        own_score = self.val["score"]
        other_score = other.val["score"]
        return own_score < other_score
59
+
60
+
61
class COCOevalCustom(COCOeval):
    """
    This is a slightly modified version of the original COCO API with added
    support for positive split evaluation: when ``dt_only_positive`` is set,
    detections whose category has no groundtruth in that image are dropped
    before matching.
    """

    def __init__(
        self, cocoGt=None, cocoDt=None, iouType="segm", dt_only_positive=False
    ):
        super().__init__(cocoGt, cocoDt, iouType)
        # When True, keep only detections for categories present in the
        # image's groundtruth ("positive" categories).
        self.dt_only_positive = dt_only_positive

    def _prepare(self):
        """
        Prepare ._gts and ._dts for evaluation based on params.

        Copy of pycocotools' COCOeval._prepare plus the positive-split
        detection filtering (see the MODIFICATION markers below).
        :return: None
        """

        def _toMask(anns, coco):
            # modify ann['segmentation'] by reference
            for ann in anns:
                rle = coco.annToRLE(ann)
                ann["segmentation"] = rle

        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # convert ground truth to mask if iouType == 'segm'
        if p.iouType == "segm":
            _toMask(gts, self.cocoGt)
            _toMask(dts, self.cocoDt)
        # set ignore flag (matches upstream pycocotools behavior)
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation

        _gts_cat_ids = defaultdict(set)  # per-image set of GT category ids
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
            _gts_cat_ids[gt["image_id"]].add(gt["category_id"])

        #### BEGIN MODIFICATION ####
        for dt in dts:
            # Positive-split mode: drop detections for categories absent
            # from this image's groundtruth.
            if (
                self.dt_only_positive
                and dt["category_id"] not in _gts_cat_ids[dt["image_id"]]
            ):
                continue
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        #### END MODIFICATION ####
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results
125
+
126
+
127
class CocoEvaluatorOfflineWithPredFileEvaluators:
    """Offline COCO mAP evaluator that scores a dumped COCO prediction file.

    Optionally also runs TIDE error analysis when the ``tidecv`` package is
    installed. All work happens on the main process only.
    """

    def __init__(
        self,
        gt_path,
        tide: bool = True,
        iou_type: str = "bbox",
        positive_split=False,
    ):
        # gt_path: path to the COCO-format groundtruth json
        # tide: request TIDE error analysis (only honored if tidecv is installed)
        # iou_type: "bbox" or "segm"
        # positive_split: evaluate only detections whose category appears in
        #   the image's groundtruth (see COCOevalCustom)
        self.gt_path = gt_path
        self.tide_enabled = HAS_TIDE and tide
        self.positive_split = positive_split
        self.iou_type = iou_type

    def evaluate(self, dumped_file):
        """Run COCO (and optionally TIDE) evaluation on ``dumped_file``.

        Returns a flat {metric_name: value} dict on the main process and
        {} on all other ranks.
        """
        if not is_main_process():
            return {}

        logging.info("OfflineCoco evaluator: Loading groundtruth")
        self.gt = COCO(self.gt_path)

        # Creating the result file
        logging.info("Coco evaluator: Creating the result file")
        cocoDt = self.gt.loadRes(str(dumped_file))

        # Run the evaluation
        logging.info("Coco evaluator: Running evaluation")
        coco_eval = COCOevalCustom(
            self.gt, cocoDt, iouType=self.iou_type, dt_only_positive=self.positive_split
        )
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

        # Name the 12 standard stats using COCO_METRICS, prefixed by iou type.
        outs = {}
        for i, value in enumerate(coco_eval.stats):
            outs[f"coco_eval_{self.iou_type}_{COCO_METRICS[i]}"] = value

        if self.tide_enabled:
            logging.info("Coco evaluator: Loading TIDE")
            self.tide_gt = datasets.COCO(self.gt_path)
            self.tide = TIDE(mode="mask" if self.iou_type == "segm" else "bbox")

            # Run TIDE
            logging.info("Coco evaluator: Running TIDE")
            self.tide.evaluate(
                self.tide_gt, datasets.COCOResult(str(dumped_file)), name="coco_eval"
            )
            self.tide.summarize()
            for k, v in self.tide.get_main_errors()["coco_eval"].items():
                outs[f"coco_eval_{self.iou_type}_TIDE_{k}"] = v

            for k, v in self.tide.get_special_errors()["coco_eval"].items():
                outs[f"coco_eval_{self.iou_type}_TIDE_{k}"] = v

        return outs
sam3/eval/coco_reindex.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """
4
+ Self-contained COCO JSON re-indexing function that creates temporary files.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import tempfile
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+
14
def reindex_coco_to_temp(input_json_path: str) -> Optional[str]:
    """
    Convert a 0-indexed COCO JSON file to 1-indexed and save it to a temporary
    location.

    Args:
        input_json_path: Path to the input COCO JSON file.

    Returns:
        Path to a JSON file in a fresh temporary directory. If the input is
        already 1-indexed, its data is copied unchanged to the temp location
        (the original file is never modified).

    Raises:
        FileNotFoundError: If the input file doesn't exist.
        json.JSONDecodeError: If the input file is not valid JSON.
        ValueError: If the input file is not in a valid COCO format.
    """

    def is_coco_json(data: Dict[str, Any]) -> bool:
        """Check whether ``data`` appears to be a COCO-format dict."""
        if not isinstance(data, dict):
            return False
        # A COCO file should have at least one of these keys.
        return any(key in data for key in ("images", "annotations", "categories"))

    def check_zero_indexed(data: Dict[str, Any]) -> Tuple[bool, bool, bool]:
        """Return (annotations_zero, images_zero, categories_zero) flags.

        A section is flagged when any of its entries has id == 0.
        """

        def has_zero_id(items) -> bool:
            return any(item.get("id", -1) == 0 for item in items)

        return (
            has_zero_id(data.get("annotations", [])),
            has_zero_id(data.get("images", [])),
            has_zero_id(data.get("categories", [])),
        )

    def reindex_coco_data(data: Dict[str, Any]) -> Dict[str, Any]:
        """Shift every 0-based id up by one, keeping cross-references consistent."""
        modified_data = data.copy()
        annotations_zero, images_zero, categories_zero = check_zero_indexed(data)

        # old-id -> new-id maps, needed to fix annotation references below
        image_id_mapping = {}
        category_id_mapping = {}

        # Process images first (annotations reference image IDs).
        if images_zero and "images" in modified_data:
            for img in modified_data["images"]:
                old_id = img["id"]
                image_id_mapping[old_id] = old_id + 1
                img["id"] = old_id + 1

        # Process categories (annotations reference category IDs).
        if categories_zero and "categories" in modified_data:
            for cat in modified_data["categories"]:
                old_id = cat["id"]
                category_id_mapping[old_id] = old_id + 1
                cat["id"] = old_id + 1

        # Update annotation ids and their image/category references.
        for ann in modified_data.get("annotations", []):
            if annotations_zero:
                ann["id"] = ann["id"] + 1
            # `None not in mapping` also covers annotations lacking the field.
            if images_zero and ann.get("image_id") in image_id_mapping:
                ann["image_id"] = image_id_mapping[ann["image_id"]]
            if categories_zero and ann.get("category_id") in category_id_mapping:
                ann["category_id"] = category_id_mapping[ann["category_id"]]

        return modified_data

    def write_to_temp(data: Dict[str, Any]) -> str:
        """Dump ``data`` into a fresh temp dir and return the new file path."""
        input_path = Path(input_json_path)
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(
            temp_dir, f"{input_path.stem}_1_indexed{input_path.suffix}"
        )
        with open(temp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        return temp_path

    # Validate input path.
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input file not found: {input_json_path}")

    # Load and validate JSON data.
    try:
        with open(input_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # Re-raise with the offending file name. JSONDecodeError requires
        # (msg, doc, pos); the previous single-argument call raised TypeError
        # instead of the intended exception.
        raise json.JSONDecodeError(
            f"Invalid JSON in {input_json_path}: {e.msg}", e.doc, e.pos
        ) from e

    # Validate COCO format.
    if not is_coco_json(data):
        raise ValueError(
            f"File does not appear to be in COCO format: {input_json_path}"
        )

    # Check if reindexing is needed.
    annotations_zero, images_zero, categories_zero = check_zero_indexed(data)
    if not (annotations_zero or images_zero or categories_zero):
        # Already 1-indexed: just copy unchanged to the temp location.
        return write_to_temp(data)

    return write_to_temp(reindex_coco_data(data))
+ return temp_path
159
+
160
+
161
+ # Example usage and test function
162
def test_reindex_function():
    """Smoke test: build a 0-indexed COCO file, reindex it, print results.

    Creates a temporary input file, runs reindex_coco_to_temp, prints the
    converted ids, and cleans up both the temp output and the input file.
    """

    # Create a test COCO file with deliberately 0-based ids everywhere.
    test_data = {
        "info": {"description": "Test COCO dataset", "version": "1.0", "year": 2023},
        "images": [
            {"id": 0, "width": 640, "height": 480, "file_name": "test1.jpg"},
            {"id": 1, "width": 640, "height": 480, "file_name": "test2.jpg"},
        ],
        "categories": [
            {"id": 0, "name": "person", "supercategory": "person"},
            {"id": 1, "name": "car", "supercategory": "vehicle"},
        ],
        "annotations": [
            {
                "id": 0,
                "image_id": 0,
                "category_id": 0,
                "bbox": [100, 100, 50, 75],
                "area": 3750,
                "iscrowd": 0,
            },
            {
                "id": 1,
                "image_id": 1,
                "category_id": 1,
                "bbox": [200, 150, 120, 80],
                "area": 9600,
                "iscrowd": 0,
            },
        ],
    }

    # Create temporary test file (delete=False so it can be re-opened by path).
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump(test_data, f, indent=2)
        test_file_path = f.name

    try:
        # Test the function
        result_path = reindex_coco_to_temp(test_file_path)
        print(f"Original file: {test_file_path}")
        print(f"Converted file: {result_path}")

        # Load and display the result
        with open(result_path, "r") as f:
            result_data = json.load(f)

        print("\nConverted data sample:")
        print(f"First image ID: {result_data['images'][0]['id']}")
        print(f"First category ID: {result_data['categories'][0]['id']}")
        print(f"First annotation ID: {result_data['annotations'][0]['id']}")
        print(f"First annotation image_id: {result_data['annotations'][0]['image_id']}")
        print(
            f"First annotation category_id: {result_data['annotations'][0]['category_id']}"
        )

        # Clean up the converted file and its temp directory.
        os.unlink(result_path)
        os.rmdir(os.path.dirname(result_path))

    finally:
        # Clean up test file
        os.unlink(test_file_path)
+ os.unlink(test_file_path)
227
+
228
+
229
+ if __name__ == "__main__":
230
+ test_reindex_function()
sam3/eval/coco_writer.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """
4
+ COCO prediction dumper for distributed training.
5
+
6
+ Handles collection and dumping of COCO-format predictions from models.
7
+ Supports distributed processing with multiple GPUs/processes.
8
+ """
9
+
10
+ import copy
11
+ import gc
12
+ import heapq
13
+ import json
14
+ import logging
15
+ import os
16
+ from collections import defaultdict
17
+ from pathlib import Path
18
+ from typing import Any, Optional
19
+
20
+ import pycocotools.mask as mask_utils
21
+ import torch
22
+ from iopath.common.file_io import g_pathmgr
23
+ from sam3.eval.coco_eval_offline import convert_to_xywh
24
+ from sam3.train.masks_ops import rle_encode
25
+ from sam3.train.utils.distributed import (
26
+ all_gather,
27
+ gather_to_rank_0_via_filesys,
28
+ get_rank,
29
+ is_main_process,
30
+ )
31
+
32
+
33
+ ### Helper functions and classes
34
+
35
+
36
class HeapElement:
    """Heap entry wrapper that orders prediction dicts by their "score" field.

    ``heapq`` only requires ``__lt__``, so this is the minimal adapter needed
    to keep a min-heap of COCO prediction records.
    """

    def __init__(self, val):
        # The wrapped prediction dict; accessed directly as ``elem.val``.
        self.val = val

    def __lt__(self, other):
        # Lower score sorts first, so the heap root is the weakest prediction.
        return self.val["score"] < other.val["score"]
44
+
45
+
46
class PredictionDumper:
    """
    Collects COCO-format predictions from a model and writes them to disk.

    Raw model outputs are converted by a postprocessor into COCO-style
    records. Supports distributed runs: each rank accumulates its own
    predictions, which are either written to a per-rank file or gathered
    and merged into a single file on rank 0.
    """

    def __init__(
        self,
        dump_dir: str,
        postprocessor,
        maxdets: int,
        iou_type: str,
        gather_pred_via_filesys: bool = False,
        merge_predictions: bool = False,
        pred_file_evaluators: Optional[Any] = None,
    ):
        """
        Args:
            dump_dir: Directory where prediction JSON files are written.
            postprocessor: Converts the model's raw outputs into COCO format.
            maxdets: Maximum number of detections kept per image when merging.
            iou_type: IoU type to evaluate; "bbox" or "segm".
            gather_pred_via_filesys: If True, gather across processes via a
                shared filesystem instead of torch collective ops.
            merge_predictions: If True, merge all ranks' predictions and dump
                them to a single file.
            pred_file_evaluators: Optional evaluators run on the merged
                prediction file; requires ``merge_predictions=True``.
        """
        self.iou_type = iou_type
        self.maxdets = maxdets
        self.dump_dir = dump_dir
        self.postprocessor = postprocessor
        self.gather_pred_via_filesys = gather_pred_via_filesys
        self.merge_predictions = merge_predictions
        self.pred_file_evaluators = pred_file_evaluators
        if self.pred_file_evaluators is not None:
            assert (
                merge_predictions
            ), "merge_predictions must be True if pred_file_evaluators are provided"
        assert self.dump_dir is not None, "dump_dir must be provided"

        if is_main_process():
            os.makedirs(self.dump_dir, exist_ok=True)
            logging.info(f"Created prediction dump directory: {self.dump_dir}")

        # Initialize the in-memory accumulation state.
        self.reset()

    def update(self, *args, **kwargs):
        """
        Convert one batch of model outputs and accumulate the predictions.

        Args:
            *args, **kwargs: Forwarded to ``postprocessor.process_results()``.
        """
        predictions = self.postprocessor.process_results(*args, **kwargs)
        self._dump(self.prepare(predictions, self.iou_type))

    def _dump(self, results):
        """
        Append COCO records to the in-memory dump, rounding floats to 5 d.p.

        Args:
            results: List of prediction dictionaries in COCO format.
        """
        # Deep-copy so rounding does not mutate the caller's records.
        rounded = copy.deepcopy(results)
        for rec in rounded:
            if "bbox" in rec:
                rec["bbox"] = [round(coord, 5) for coord in rec["bbox"]]
            rec["score"] = round(rec["score"], 5)
        self.dump.extend(rounded)

    def synchronize_between_processes(self):
        """
        Flush accumulated predictions to disk, optionally merging across ranks.

        In merge mode, predictions are gathered from all processes and rank 0
        writes one combined file; otherwise each rank writes its own file.
        The accumulation state is reset afterwards.

        Returns:
            Path of the file written for this rank (or the merged file).
        """
        logging.info("Prediction Dumper: Synchronizing between processes")

        if self.merge_predictions:
            self.dump = self.gather_and_merge_predictions()
            dumped_file = Path(self.dump_dir) / f"coco_predictions_{self.iou_type}.json"
            if is_main_process():
                logging.info(
                    f"Prediction Dumper: Dumping merged predictions to {dumped_file}"
                )
                with g_pathmgr.open(str(dumped_file), "w") as f:
                    json.dump(self.dump, f)
        else:
            dumped_file = (
                Path(self.dump_dir)
                / f"coco_predictions_{self.iou_type}_{get_rank()}.json"
            )
            logging.info(
                f"Prediction Dumper: Dumping local predictions to {dumped_file}"
            )
            with g_pathmgr.open(str(dumped_file), "w") as f:
                json.dump(self.dump, f)

        self.reset()
        return dumped_file

    def gather_and_merge_predictions(self):
        """
        Gather predictions from every process and merge them.

        Keeps at most ``maxdets`` top-scoring predictions per image (via a
        min-heap) and drops any (image_id, category_id) pair that was already
        contributed by an earlier rank's dump.

        Returns:
            List of merged prediction dictionaries.
        """
        logging.info("Prediction Dumper: Gathering predictions from all processes")
        gc.collect()

        if self.gather_pred_via_filesys:
            per_rank_dumps = gather_to_rank_0_via_filesys(self.dump)
        else:
            per_rank_dumps = all_gather(self.dump, force_cpu=True)

        top_preds = defaultdict(list)  # image_id -> min-heap of HeapElement
        seen_img_cat = set()

        for rank_dump in per_rank_dumps:
            rank_seen = set()
            for pred in rank_dump:
                key = (pred["image_id"], pred["category_id"])

                # Skip pairs already contributed by a previous rank's dump.
                if key in seen_img_cat:
                    continue
                rank_seen.add(key)

                heap = top_preds[pred["image_id"]]
                if len(heap) < self.maxdets:
                    heapq.heappush(heap, HeapElement(pred))
                else:
                    # Heap full: push, then evict the lowest-scoring entry.
                    heapq.heappushpop(heap, HeapElement(pred))

            # Only mark pairs as "seen" once the whole rank is processed, so
            # a single rank can contribute multiple preds per (image, cat).
            seen_img_cat.update(rank_seen)

        # Flatten the per-image heaps back into one list of prediction dicts.
        return [elem.val for heap in top_preds.values() for elem in heap]

    def compute_synced(self):
        """
        Synchronize/dump predictions, then run any configured file evaluators.

        Returns:
            Metric dict from the evaluators on rank 0; a placeholder
            ``{"": 0.0}`` on other ranks or when nothing was evaluated.
        """
        dumped_file = self.synchronize_between_processes()
        if not is_main_process():
            return {"": 0.0}

        meters = {}
        if self.pred_file_evaluators is not None:
            for evaluator in self.pred_file_evaluators:
                meters.update(evaluator.evaluate(dumped_file))

        return meters if len(meters) > 0 else {"": 0.0}

    def compute(self):
        """Compute without synchronization; returns a placeholder metric dict."""
        return {"": 0.0}

    def reset(self):
        """Clear accumulated predictions for a new evaluation round."""
        self.dump = []

    def prepare(self, predictions, iou_type):
        """
        Dispatch predictions to the converter matching ``iou_type``.

        Args:
            predictions: Maps image ids to per-image prediction dicts.
            iou_type: "bbox" or "segm".

        Returns:
            List of COCO-format prediction dictionaries.

        Raises:
            ValueError: If ``iou_type`` is not recognized.
        """
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        if iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        raise ValueError(f"Unknown iou type: {iou_type}")

    def prepare_for_coco_detection(self, predictions):
        """
        Convert per-image box predictions into COCO detection records.

        Args:
            predictions: Maps image ids to dicts with "boxes", "scores",
                and "labels".

        Returns:
            List of COCO-format detection dictionaries.
        """
        coco_results = []
        for image_id, pred in predictions.items():
            if len(pred) == 0:
                continue

            # COCO expects xywh boxes.
            xywh_boxes = convert_to_xywh(pred["boxes"]).tolist()
            scores = pred["scores"].tolist()
            labels = pred["labels"].tolist()

            for k, box in enumerate(xywh_boxes):
                coco_results.append(
                    {
                        "image_id": image_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                )
        return coco_results

    @torch.no_grad()
    def prepare_for_coco_segmentation(self, predictions):
        """
        Convert per-image mask predictions into COCO segmentation records.

        Accepts either pre-encoded "masks_rle" or dense "masks" (thresholded
        at 0.5 and RLE-encoded here). The "area" field is the mask area as a
        fraction of the image area.

        Args:
            predictions: Maps image ids to dicts with "masks" or "masks_rle",
                "scores", "labels", and optionally "boxes".

        Returns:
            List of COCO-format segmentation dictionaries with RLE masks.
        """
        coco_results = []
        for image_id, pred in predictions.items():
            if len(pred) == 0:
                continue

            scores = pred["scores"].tolist()
            labels = pred["labels"].tolist()

            xywh_boxes = None
            if "boxes" in pred:
                xywh_boxes = convert_to_xywh(pred["boxes"]).tolist()
                assert len(xywh_boxes) == len(scores)

            if "masks_rle" in pred:
                rles = pred["masks_rle"]
                areas = []
                for rle in rles:
                    h, w = rle["size"]
                    # Normalize the absolute RLE area by the image area.
                    areas.append(mask_utils.area(rle) / (h * w))
            else:
                masks = pred["masks"] > 0.5
                h, w = masks.shape[-2:]

                areas = (masks.flatten(1).sum(1) / (h * w)).tolist()
                rles = rle_encode(masks.squeeze(1))

                # Free the dense masks as early as possible.
                del masks
                del pred["masks"]

            assert len(areas) == len(rles) == len(scores)

            for k, rle in enumerate(rles):
                record = {
                    "image_id": image_id,
                    "category_id": labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                    "area": areas[k],
                }
                if xywh_boxes is not None:
                    record["bbox"] = xywh_boxes[k]

                coco_results.append(record)

        return coco_results
sam3/eval/conversion_util.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+ import json
3
+ import os
4
+ from collections import defaultdict
5
+
6
+ from tqdm import tqdm
7
+
8
+
9
def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS dataset to COCO-style video instance segmentation format.

    Fixes over the naive version: annotation files are opened via context
    managers (no leaked handles), and a directory is only created when
    ``save_path`` actually contains one (``os.makedirs("")`` raises
    FileNotFoundError for bare filenames).

    Args:
        ann_json (str): Path to YouTube VIS annotation JSON file.
        save_path (str): Optional path to save the converted COCO-style JSON.
            When None, the converted dict is only returned.

    Returns:
        dict: COCO-style structure with "videos", "images", "tracks",
        "annotations", and "categories" entries.
    """
    # Initialize COCO structure
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": [],
        "licenses": [],
    }

    # Load original annotations (context manager avoids leaking the handle)
    with open(ann_json) as f:
        official_anns = json.load(f)
    VIS["categories"] = official_anns["categories"]  # Direct copy categories

    # Running image / annotation id counters (1-based, COCO convention)
    records = dict(img_id=1, ann_id=1)

    # Create video-to-annotations mapping
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)

    # Each YTVIS annotation corresponds to exactly one track
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]

    # Process videos
    for video_info in tqdm(official_anns["videos"]):
        # Create video entry
        video = {
            "id": video_info["id"],
            "name": os.path.dirname(video_info["file_names"][0]),
            "width": video_info["width"],
            "height": video_info["height"],
            "length": video_info["length"],
            "neg_category_ids": [],
            "not_exhaustive_category_ids": [],
        }
        VIS["videos"].append(video)

        # Process frames
        num_frames = len(video_info["file_names"])
        for frame_idx in range(num_frames):
            # Create image entry (one per frame)
            image = {
                "id": records["img_id"],
                "video_id": video_info["id"],
                "file_name": video_info["file_names"][frame_idx],
                "width": video_info["width"],
                "height": video_info["height"],
                "frame_index": frame_idx,
                "frame_id": frame_idx,
            }
            VIS["images"].append(image)

            # Process annotations for this frame
            if video_info["id"] in vid_to_anns:
                for ann in vid_to_anns[video_info["id"]]:
                    bbox = ann["bboxes"][frame_idx]
                    # YTVIS stores None for frames where the object is absent
                    if bbox is None:
                        continue

                    # Create annotation entry
                    annotation = {
                        "id": records["ann_id"],
                        "video_id": video_info["id"],
                        "image_id": records["img_id"],
                        "track_id": ann["id"],
                        "category_id": ann["category_id"],
                        "bbox": bbox,
                        "area": ann["areas"][frame_idx],
                        "segmentation": ann["segmentations"][frame_idx],
                        "iscrowd": ann["iscrowd"],
                    }
                    VIS["annotations"].append(annotation)
                    records["ann_id"] += 1

            records["img_id"] += 1

    # Print summary
    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")

    if save_path is None:
        return VIS

    # Save output; only create a directory when the path actually has one
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(VIS, f)

    return VIS
117
+
118
+
119
def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """
    Convert YouTubeVIS predictions to COCO format with video_id preservation.

    Each YTVIS prediction (one track spanning many frames) is expanded into
    per-frame COCO annotations sharing a freshly assigned track id.

    Args:
        youtubevis_pred_path: Path to YouTubeVIS prediction JSON
        converted_dataset_path: Path to converted COCO dataset JSON
        output_path: Path to save COCO format predictions
    """

    # Load YouTubeVIS predictions
    with open(youtubevis_pred_path) as f:
        ytv_predictions = json.load(f)

    # Load converted dataset for image ID mapping
    with open(converted_dataset_path) as f:
        coco_dataset = json.load(f)

    # (video_id, frame_idx) -> image_id lookup built from the converted GT
    image_id_map = {
        (img["video_id"], img["frame_index"]): img["id"]
        for img in coco_dataset["images"]
    }

    coco_annotations = []
    next_track_id = 1  # Unique track ID generator

    for pred in tqdm(ytv_predictions):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        score = pred["score"]

        # Per-frame masks/areas are optional; pad with None to align lengths
        segmentations = pred.get("segmentations", [])
        if not segmentations:
            segmentations = [None] * len(bboxes)
        areas = pred.get("areas", [])
        if not areas:
            areas = [None] * len(bboxes)

        # One fresh track id per YTVIS prediction
        track_id = next_track_id
        next_track_id += 1

        for frame_idx, (bbox, segmentation, pred_area) in enumerate(
            zip(bboxes, segmentations, areas)
        ):
            # Skip frames with a missing object (None or all-zero bbox)
            if bbox is None or all(coord == 0 for coord in bbox):
                continue

            # Resolve the frame to its image id in the converted dataset
            image_id = image_id_map.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
                )

            x, y, w, h = bbox

            # Prefer the predicted mask area; fall back to the bbox area
            if pred_area is not None and pred_area > 0:
                area = pred_area
            else:
                area = w * h

            record = {
                "image_id": int(image_id),
                "video_id": video_id,  # preserved for video-level evaluation
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }

            # Attach the mask only when the prediction carried one
            if segmentation is not None:
                record["segmentation"] = segmentation

            coco_annotations.append(record)

    # Save output
    with open(output_path, "w") as f:
        json.dump(coco_annotations, f)

    print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")
sam3/eval/demo_eval.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """
4
+ This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
5
+ This means that the model's predictions are thresholded and evaluated as "hard" predictions.
6
+ """
7
+
8
+ import logging
9
+ from typing import Optional
10
+
11
+ import numpy as np
12
+ import pycocotools.mask as maskUtils
13
+ from pycocotools.cocoeval import COCOeval
14
+
15
+ from sam3.eval.coco_eval import CocoEvaluator
16
+ from sam3.train.masks_ops import compute_F_measure
17
+ from sam3.train.utils.distributed import is_main_process
18
+
19
+ from scipy.optimize import linear_sum_assignment
20
+
21
+
22
+ class DemoEval(COCOeval):
23
+ """
24
+ This evaluator is based upon COCO evaluation, but evaluates the model in a "demo" setting.
25
+ This means that the model's predictions are thresholded and evaluated as "hard" predictions.
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ coco_gt=None,
31
+ coco_dt=None,
32
+ iouType="bbox",
33
+ threshold=0.5,
34
+ compute_JnF=False,
35
+ ):
36
+ """
37
+ Args:
38
+ coco_gt (COCO): ground truth COCO API
39
+ coco_dt (COCO): detections COCO API
40
+ iou_type (str): type of IoU to evaluate
41
+ threshold (float): threshold for predictions
42
+ """
43
+ super().__init__(coco_gt, coco_dt, iouType)
44
+ self.threshold = threshold
45
+
46
+ self.params.useCats = False
47
+ self.params.areaRng = [[0**2, 1e5**2]]
48
+ self.params.areaRngLbl = ["all"]
49
+ self.params.maxDets = [100000]
50
+ self.compute_JnF = compute_JnF
51
+
52
+ def computeIoU(self, imgId, catId):
53
+ # Same as the original COCOeval.computeIoU, but without sorting
54
+ p = self.params
55
+ if p.useCats:
56
+ gt = self._gts[imgId, catId]
57
+ dt = self._dts[imgId, catId]
58
+ else:
59
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
60
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
61
+ if len(gt) == 0 and len(dt) == 0:
62
+ return []
63
+
64
+ if p.iouType == "segm":
65
+ g = [g["segmentation"] for g in gt]
66
+ d = [d["segmentation"] for d in dt]
67
+ elif p.iouType == "bbox":
68
+ g = [g["bbox"] for g in gt]
69
+ d = [d["bbox"] for d in dt]
70
+ else:
71
+ raise Exception("unknown iouType for iou computation")
72
+
73
+ # compute iou between each dt and gt region
74
+ iscrowd = [int(o["iscrowd"]) for o in gt]
75
+ ious = maskUtils.iou(d, g, iscrowd)
76
+ return ious
77
+
78
+ def evaluateImg(self, imgId, catId, aRng, maxDet):
79
+ """
80
+ perform evaluation for single category and image
81
+ :return: dict (single image results)
82
+ """
83
+ p = self.params
84
+ assert not p.useCats, "This evaluator does not support per-category evaluation."
85
+ assert catId == -1
86
+ all_gts = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
87
+ keep_gt = np.array([not g["ignore"] for g in all_gts], dtype=bool)
88
+ gt = [g for g in all_gts if not g["ignore"]]
89
+ all_dts = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
90
+ keep_dt = np.array([d["score"] >= self.threshold for d in all_dts], dtype=bool)
91
+ dt = [d for d in all_dts if d["score"] >= self.threshold]
92
+ if len(gt) == 0 and len(dt) == 0:
93
+ # This is a "true negative" case, where there are no GTs and no predictions
94
+ # The box-level metrics are ill-defined, so we don't add them to this dict
95
+ return {
96
+ "image_id": imgId,
97
+ "IL_TP": 0,
98
+ "IL_TN": 1,
99
+ "IL_FP": 0,
100
+ "IL_FN": 0,
101
+ "IL_perfect_neg": np.ones((len(p.iouThrs),), dtype=np.int64),
102
+ "num_dt": len(dt),
103
+ }
104
+
105
+ if len(gt) > 0 and len(dt) == 0:
106
+ # This is a "false negative" case, where there are GTs but no predictions
107
+ return {
108
+ "image_id": imgId,
109
+ "IL_TP": 0,
110
+ "IL_TN": 0,
111
+ "IL_FP": 0,
112
+ "IL_FN": 1,
113
+ "TPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
114
+ "FPs": np.zeros((len(p.iouThrs),), dtype=np.int64),
115
+ "FNs": np.ones((len(p.iouThrs),), dtype=np.int64) * len(gt),
116
+ "local_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
117
+ "local_positive_F1s": np.zeros((len(p.iouThrs),), dtype=np.int64),
118
+ "IL_perfect_pos": np.zeros((len(p.iouThrs),), dtype=np.int64),
119
+ "num_dt": len(dt),
120
+ }
121
+
122
+ # Load pre-computed ious
123
+ ious = self.ious[(imgId, catId)]
124
+
125
+ # compute matching
126
+ if len(ious) == 0:
127
+ ious = np.zeros((len(dt), len(gt)))
128
+ else:
129
+ ious = ious[keep_dt, :][:, keep_gt]
130
+ assert ious.shape == (len(dt), len(gt))
131
+
132
+ matched_dt, matched_gt = linear_sum_assignment(-ious)
133
+
134
+ match_scores = ious[matched_dt, matched_gt]
135
+
136
+ if self.compute_JnF and len(match_scores) > 0:
137
+ j_score = match_scores.mean()
138
+ f_measure = 0
139
+ for dt_id, gt_id in zip(matched_dt, matched_gt):
140
+ f_measure += compute_F_measure(
141
+ gt_boundary_rle=gt[gt_id]["boundary"],
142
+ gt_dilated_boundary_rle=gt[gt_id]["dilated_boundary"],
143
+ dt_boundary_rle=dt[dt_id]["boundary"],
144
+ dt_dilated_boundary_rle=dt[dt_id]["dilated_boundary"],
145
+ )
146
+ f_measure /= len(match_scores) + 1e-9
147
+ JnF = (j_score + f_measure) * 0.5
148
+ else:
149
+ j_score = f_measure = JnF = -1
150
+
151
+ TPs, FPs, FNs = [], [], []
152
+ IL_perfect = []
153
+ for thresh in p.iouThrs:
154
+ TP = (match_scores >= thresh).sum()
155
+ FP = len(dt) - TP
156
+ FN = len(gt) - TP
157
+ assert (
158
+ FP >= 0 and FN >= 0
159
+ ), f"FP: {FP}, FN: {FN}, TP: {TP}, match_scores: {match_scores}, len(dt): {len(dt)}, len(gt): {len(gt)}, ious: {ious}"
160
+ TPs.append(TP)
161
+ FPs.append(FP)
162
+ FNs.append(FN)
163
+
164
+ if FP == FN and FP == 0:
165
+ IL_perfect.append(1)
166
+ else:
167
+ IL_perfect.append(0)
168
+
169
+ TPs = np.array(TPs, dtype=np.int64)
170
+ FPs = np.array(FPs, dtype=np.int64)
171
+ FNs = np.array(FNs, dtype=np.int64)
172
+ IL_perfect = np.array(IL_perfect, dtype=np.int64)
173
+
174
+ # compute precision recall and F1
175
+ precision = TPs / (TPs + FPs + 1e-4)
176
+ assert np.all(precision <= 1)
177
+ recall = TPs / (TPs + FNs + 1e-4)
178
+ assert np.all(recall <= 1)
179
+ F1 = 2 * precision * recall / (precision + recall + 1e-4)
180
+
181
+ result = {
182
+ "image_id": imgId,
183
+ "TPs": TPs,
184
+ "FPs": FPs,
185
+ "FNs": FNs,
186
+ "local_F1s": F1,
187
+ "IL_TP": (len(gt) > 0) and (len(dt) > 0),
188
+ "IL_FP": (len(gt) == 0) and (len(dt) > 0),
189
+ "IL_TN": (len(gt) == 0) and (len(dt) == 0),
190
+ "IL_FN": (len(gt) > 0) and (len(dt) == 0),
191
+ ("IL_perfect_pos" if len(gt) > 0 else "IL_perfect_neg"): IL_perfect,
192
+ "F": f_measure,
193
+ "J": j_score,
194
+ "J&F": JnF,
195
+ "num_dt": len(dt),
196
+ }
197
+ if len(gt) > 0 and len(dt) > 0:
198
+ result["local_positive_F1s"] = F1
199
+ return result
200
+
201
+ def accumulate(self, p=None):
202
+ """
203
+ Accumulate per image evaluation results and store the result in self.eval
204
+ :param p: input params for evaluation
205
+ :return: None
206
+ """
207
+ if not self.evalImgs:
208
+ print("Please run evaluate() first")
209
+ # allows input customized parameters
210
+ if p is None:
211
+ p = self.params
212
+
213
+ setImgIds = set(p.imgIds)
214
+
215
+ # TPs, FPs, FNs
216
+ TPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
217
+ FPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
218
+ pmFPs = np.zeros((len(p.iouThrs),), dtype=np.int64)
219
+ FNs = np.zeros((len(p.iouThrs),), dtype=np.int64)
220
+ local_F1s = np.zeros((len(p.iouThrs),), dtype=np.float64)
221
+
222
+ # Image level metrics
223
+ IL_TPs = 0
224
+ IL_FPs = 0
225
+ IL_TNs = 0
226
+ IL_FNs = 0
227
+ IL_perfects_neg = np.zeros((len(p.iouThrs),), dtype=np.int64)
228
+ IL_perfects_pos = np.zeros((len(p.iouThrs),), dtype=np.int64)
229
+
230
+ # JnF metric
231
+ total_J = 0
232
+ total_F = 0
233
+ total_JnF = 0
234
+
235
+ valid_img_count = 0
236
+ total_pos_count = 0
237
+ total_neg_count = 0
238
+ valid_J_count = 0
239
+ valid_F1_count = 0
240
+ valid_F1_count_w0dt = 0
241
+ for res in self.evalImgs:
242
+ if res["image_id"] not in setImgIds:
243
+ continue
244
+ IL_TPs += res["IL_TP"]
245
+ IL_FPs += res["IL_FP"]
246
+ IL_TNs += res["IL_TN"]
247
+ IL_FNs += res["IL_FN"]
248
+ if "IL_perfect_neg" in res:
249
+ IL_perfects_neg += res["IL_perfect_neg"]
250
+ total_neg_count += 1
251
+ else:
252
+ assert "IL_perfect_pos" in res
253
+ IL_perfects_pos += res["IL_perfect_pos"]
254
+ total_pos_count += 1
255
+
256
+ if "TPs" not in res:
257
+ continue
258
+
259
+ TPs += res["TPs"]
260
+ FPs += res["FPs"]
261
+ FNs += res["FNs"]
262
+ valid_img_count += 1
263
+
264
+ if "local_positive_F1s" in res:
265
+ local_F1s += res["local_positive_F1s"]
266
+ pmFPs += res["FPs"]
267
+ valid_F1_count_w0dt += 1
268
+ if res["num_dt"] > 0:
269
+ valid_F1_count += 1
270
+
271
+ if "J" in res and res["J"] > -1e-9:
272
+ total_J += res["J"]
273
+ total_F += res["F"]
274
+ total_JnF += res["J&F"]
275
+ valid_J_count += 1
276
+
277
+ # compute precision recall and F1
278
+ precision = TPs / (TPs + FPs + 1e-4)
279
+ positive_micro_precision = TPs / (TPs + pmFPs + 1e-4)
280
+ assert np.all(precision <= 1)
281
+ recall = TPs / (TPs + FNs + 1e-4)
282
+ assert np.all(recall <= 1)
283
+ F1 = 2 * precision * recall / (precision + recall + 1e-4)
284
+ positive_micro_F1 = (
285
+ 2
286
+ * positive_micro_precision
287
+ * recall
288
+ / (positive_micro_precision + recall + 1e-4)
289
+ )
290
+
291
+ IL_rec = IL_TPs / (IL_TPs + IL_FNs + 1e-6)
292
+ IL_prec = IL_TPs / (IL_TPs + IL_FPs + 1e-6)
293
+ IL_F1 = 2 * IL_prec * IL_rec / (IL_prec + IL_rec + 1e-6)
294
+ IL_FPR = IL_FPs / (IL_FPs + IL_TNs + 1e-6)
295
+ IL_MCC = float(IL_TPs * IL_TNs - IL_FPs * IL_FNs) / (
296
+ (
297
+ float(IL_TPs + IL_FPs)
298
+ * float(IL_TPs + IL_FNs)
299
+ * float(IL_TNs + IL_FPs)
300
+ * float(IL_TNs + IL_FNs)
301
+ )
302
+ ** 0.5
303
+ + 1e-6
304
+ )
305
+ IL_perfect_pos = IL_perfects_pos / (total_pos_count + 1e-9)
306
+ IL_perfect_neg = IL_perfects_neg / (total_neg_count + 1e-9)
307
+
308
+ total_J = total_J / (valid_J_count + 1e-9)
309
+ total_F = total_F / (valid_J_count + 1e-9)
310
+ total_JnF = total_JnF / (valid_J_count + 1e-9)
311
+
312
+ self.eval = {
313
+ "params": p,
314
+ "TPs": TPs,
315
+ "FPs": FPs,
316
+ "positive_micro_FPs": pmFPs,
317
+ "FNs": FNs,
318
+ "precision": precision,
319
+ "positive_micro_precision": positive_micro_precision,
320
+ "recall": recall,
321
+ "F1": F1,
322
+ "positive_micro_F1": positive_micro_F1,
323
+ "positive_macro_F1": local_F1s / valid_F1_count,
324
+ "positive_w0dt_macro_F1": local_F1s / valid_F1_count_w0dt,
325
+ "IL_recall": IL_rec,
326
+ "IL_precision": IL_prec,
327
+ "IL_F1": IL_F1,
328
+ "IL_FPR": IL_FPR,
329
+ "IL_MCC": IL_MCC,
330
+ "IL_perfect_pos": IL_perfect_pos,
331
+ "IL_perfect_neg": IL_perfect_neg,
332
+ "J": total_J,
333
+ "F": total_F,
334
+ "J&F": total_JnF,
335
+ }
336
+ self.eval["CGF1"] = self.eval["positive_macro_F1"] * self.eval["IL_MCC"]
337
+ self.eval["CGF1_w0dt"] = (
338
+ self.eval["positive_w0dt_macro_F1"] * self.eval["IL_MCC"]
339
+ )
340
+ self.eval["CGF1_micro"] = self.eval["positive_micro_F1"] * self.eval["IL_MCC"]
341
+
342
+ def summarize(self):
343
+ """
344
+ Compute and display summary metrics for evaluation results.
345
+ Note this functin can *only* be applied on the default parameter setting
346
+ """
347
+ if not self.eval:
348
+ raise Exception("Please run accumulate() first")
349
+
350
+ def _summarize(iouThr=None, metric=""):
351
+ p = self.params
352
+ iStr = " {:<18} @[ IoU={:<9}] = {:0.3f}"
353
+ titleStr = "Average " + metric
354
+ iouStr = (
355
+ "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
356
+ if iouThr is None
357
+ else "{:0.2f}".format(iouThr)
358
+ )
359
+
360
+ s = self.eval[metric]
361
+ # IoU
362
+ if iouThr is not None:
363
+ t = np.where(iouThr == p.iouThrs)[0]
364
+ s = s[t]
365
+
366
+ if len(s[s > -1]) == 0:
367
+ mean_s = -1
368
+ else:
369
+ mean_s = np.mean(s[s > -1])
370
+ print(iStr.format(titleStr, iouStr, mean_s))
371
+ return mean_s
372
+
373
+ def _summarize_single(metric=""):
374
+ titleStr = "Average " + metric
375
+ iStr = " {:<35} = {:0.3f}"
376
+ s = self.eval[metric]
377
+ print(iStr.format(titleStr, s))
378
+ return s
379
+
380
+ def _summarizeDets():
381
+ # note: the index of these metrics are also used in video Demo F1 evaluation
382
+ # when adding new metrics, please update the index in video Demo F1 evaluation
383
+ # in "evaluate" method of the "VideoDemoF1Evaluator" class
384
+ stats = np.zeros((len(DEMO_METRICS),))
385
+ stats[0] = _summarize(metric="CGF1")
386
+ stats[1] = _summarize(metric="precision")
387
+ stats[2] = _summarize(metric="recall")
388
+ stats[3] = _summarize(metric="F1")
389
+ stats[4] = _summarize(metric="positive_macro_F1")
390
+ stats[5] = _summarize_single(metric="IL_precision")
391
+ stats[6] = _summarize_single(metric="IL_recall")
392
+ stats[7] = _summarize_single(metric="IL_F1")
393
+ stats[8] = _summarize_single(metric="IL_FPR")
394
+ stats[9] = _summarize_single(metric="IL_MCC")
395
+ stats[10] = _summarize(metric="IL_perfect_pos")
396
+ stats[11] = _summarize(metric="IL_perfect_neg")
397
+ stats[12] = _summarize(iouThr=0.5, metric="CGF1")
398
+ stats[13] = _summarize(iouThr=0.5, metric="precision")
399
+ stats[14] = _summarize(iouThr=0.5, metric="recall")
400
+ stats[15] = _summarize(iouThr=0.5, metric="F1")
401
+ stats[16] = _summarize(iouThr=0.5, metric="positive_macro_F1")
402
+ stats[17] = _summarize(iouThr=0.5, metric="IL_perfect_pos")
403
+ stats[18] = _summarize(iouThr=0.5, metric="IL_perfect_neg")
404
+ stats[19] = _summarize(iouThr=0.75, metric="CGF1")
405
+ stats[20] = _summarize(iouThr=0.75, metric="precision")
406
+ stats[21] = _summarize(iouThr=0.75, metric="recall")
407
+ stats[22] = _summarize(iouThr=0.75, metric="F1")
408
+ stats[23] = _summarize(iouThr=0.75, metric="positive_macro_F1")
409
+ stats[24] = _summarize(iouThr=0.75, metric="IL_perfect_pos")
410
+ stats[25] = _summarize(iouThr=0.75, metric="IL_perfect_neg")
411
+ stats[26] = _summarize_single(metric="J")
412
+ stats[27] = _summarize_single(metric="F")
413
+ stats[28] = _summarize_single(metric="J&F")
414
+ stats[29] = _summarize(metric="CGF1_micro")
415
+ stats[30] = _summarize(metric="positive_micro_precision")
416
+ stats[31] = _summarize(metric="positive_micro_F1")
417
+ stats[32] = _summarize(iouThr=0.5, metric="CGF1_micro")
418
+ stats[33] = _summarize(iouThr=0.5, metric="positive_micro_precision")
419
+ stats[34] = _summarize(iouThr=0.5, metric="positive_micro_F1")
420
+ stats[35] = _summarize(iouThr=0.75, metric="CGF1_micro")
421
+ stats[36] = _summarize(iouThr=0.75, metric="positive_micro_precision")
422
+ stats[37] = _summarize(iouThr=0.75, metric="positive_micro_F1")
423
+ stats[38] = _summarize(metric="CGF1_w0dt")
424
+ stats[39] = _summarize(metric="positive_w0dt_macro_F1")
425
+ stats[40] = _summarize(iouThr=0.5, metric="CGF1_w0dt")
426
+ stats[41] = _summarize(iouThr=0.5, metric="positive_w0dt_macro_F1")
427
+ stats[42] = _summarize(iouThr=0.75, metric="CGF1_w0dt")
428
+ stats[43] = _summarize(iouThr=0.75, metric="positive_w0dt_macro_F1")
429
+ return stats
430
+
431
+ summarize = _summarizeDets
432
+ self.stats = summarize()
433
+
434
+
435
# Metric names reported by DemoEval.summarize(). The ORDER must match the
# stats indices written in DemoEval's _summarizeDets (and is also relied on
# by the video Demo F1 evaluation).
_MAIN_METRICS = ["CGF1", "Precision", "Recall", "F1", "Macro_F1"]
_IL_SINGLE_METRICS = ["IL_Precision", "IL_Recall", "IL_F1", "IL_FPR", "IL_MCC"]
_IL_PERFECT_METRICS = ["IL_perfect_pos", "IL_perfect_neg"]
_MICRO_METRICS = ["CGF1_micro", "positive_micro_Precision", "positive_micro_F1"]
_W0DT_METRICS = ["CGF1_w0dt", "positive_w0dt_macro_F1"]


def _with_iou_suffix(names, thr):
    # Suffix each metric name with an IoU threshold, e.g. "CGF1@0.5".
    return [name + "@" + thr for name in names]


DEMO_METRICS = (
    _MAIN_METRICS
    + _IL_SINGLE_METRICS
    + _IL_PERFECT_METRICS
    + _with_iou_suffix(_MAIN_METRICS + _IL_PERFECT_METRICS, "0.5")
    + _with_iou_suffix(_MAIN_METRICS + _IL_PERFECT_METRICS, "0.75")
    + ["J", "F", "J&F"]
    + _MICRO_METRICS
    + _with_iou_suffix(_MICRO_METRICS, "0.5")
    + _with_iou_suffix(_MICRO_METRICS, "0.75")
    + _W0DT_METRICS
    + _with_iou_suffix(_W0DT_METRICS, "0.5")
    + _with_iou_suffix(_W0DT_METRICS, "0.75")
)
481
+
482
+
483
class DemoEvaluator(CocoEvaluator):
    """COCO-style evaluator that scores predictions with :class:`DemoEval`
    (CGF1-family metrics) instead of standard COCO AP."""

    def __init__(
        self,
        coco_gt,
        iou_types,
        dump_dir: Optional[str],
        postprocessor,
        threshold=0.5,
        average_by_rarity=False,
        gather_pred_via_filesys=False,
        exhaustive_only=False,
        all_exhaustive_only=True,
        compute_JnF=False,
        metrics_dump_dir: Optional[str] = None,
    ):
        self.iou_types = iou_types
        self.threshold = threshold
        super().__init__(
            coco_gt=coco_gt,
            iou_types=iou_types,
            useCats=False,
            dump_dir=dump_dir,
            postprocessor=postprocessor,
            # average_by_rarity=average_by_rarity,
            gather_pred_via_filesys=gather_pred_via_filesys,
            exhaustive_only=exhaustive_only,
            all_exhaustive_only=all_exhaustive_only,
            metrics_dump_dir=metrics_dump_dir,
        )
        self.use_self_evaluate = True
        self.compute_JnF = compute_JnF

    def _lazy_init(self):
        # Initialize once; re-assert self-evaluation mode after the parent's
        # lazy initialization, then rebuild the per-GT evaluators.
        if self.initialized:
            return
        super()._lazy_init()
        self.use_self_evaluate = True
        self.reset()

    def select_best_scoring(self, scorings):
        # "Oracle" evaluation: given the evaluation results with respect to
        # several ground truths, keep the best-scoring result per image.
        if len(scorings) == 1:
            return scorings[0]

        assert (
            scorings[0].ndim == 3
        ), f"Expecting results in [numCats, numAreas, numImgs] format, got {scorings[0].shape}"
        assert (
            scorings[0].shape[0] == 1
        ), f"Expecting a single category, got {scorings[0].shape[0]}"
        for scoring in scorings:
            assert (
                scoring.shape == scorings[0].shape
            ), f"Shape mismatch: {scoring.shape}, {scorings[0].shape}"

        picked = []
        for img_idx in range(scorings[0].shape[-1]):
            winner = scorings[0][:, :, img_idx]
            for candidate in scorings[1:]:
                challenger = candidate[:, :, img_idx]
                if "local_F1s" in winner[0, 0] and "local_F1s" in challenger[0, 0]:
                    # Both evaluations produced a valid F1 for this image.
                    # "local_F1s" holds results at several IoU thresholds; the
                    # mean over thresholds is used for the comparison.
                    if (
                        challenger[0, 0]["local_F1s"].mean()
                        > winner[0, 0]["local_F1s"].mean()
                    ):
                        winner = challenger
                elif "local_F1s" not in challenger[0, 0]:
                    # No valid local F1 means both predictions and targets
                    # were empty; treat it as a perfect prediction.
                    winner = challenger
            picked.append(winner)
        stacked = np.stack(picked, axis=-1)
        assert stacked.shape == scorings[0].shape
        return stacked

    def summarize(self):
        """Accumulate, print and return the Demo metrics (main process only)."""
        self._lazy_init()
        logging.info("Demo evaluator: Summarizing")
        if not is_main_process():
            return {}
        outs = {}
        # Prefix metrics with "oracle_" when scoring against multiple GTs.
        prefix = "oracle_" if len(self.coco_evals) > 1 else ""
        self.accumulate(self.eval_img_ids)
        for iou_type, coco_eval in self.coco_evals[0].items():
            print("Demo metric, IoU type={}".format(iou_type))
            coco_eval.summarize()

        # Export per-metric scalars, keyed by IoU type and DEMO_METRICS name.
        for iou_type, key_stub in (
            ("bbox", "coco_eval_bbox_"),
            ("segm", "coco_eval_masks_"),
        ):
            if iou_type not in self.coco_evals[0]:
                continue
            for i, value in enumerate(self.coco_evals[0][iou_type].stats):
                outs[f"{key_stub}{prefix}{DEMO_METRICS[i]}"] = value
        return outs

    def accumulate(self, imgIds=None):
        """Run DemoEval accumulation, optionally restricted to imgIds."""
        self._lazy_init()
        logging.info(
            f"demo evaluator: Accumulating on {len(imgIds) if imgIds is not None else 'all'} images"
        )
        if not is_main_process():
            return

        for coco_eval in self.coco_evals[0].values():
            if imgIds is not None:
                coco_eval.params.imgIds = list(imgIds)
            coco_eval.accumulate()

    def reset(self):
        # One DemoEval per (ground truth, IoU type) pair.
        self.coco_evals = [{} for _ in range(len(self.coco_gts))]
        for gt_idx, coco_gt in enumerate(self.coco_gts):
            for iou_type in self.iou_types:
                evaluator = DemoEval(
                    coco_gt=coco_gt,
                    iouType=iou_type,
                    threshold=self.threshold,
                    compute_JnF=self.compute_JnF,
                )
                evaluator.useCats = False
                self.coco_evals[gt_idx][iou_type] = evaluator
        self.img_ids = []
        self.eval_imgs = {iou_type: [] for iou_type in self.iou_types}
        if self.dump is not None:
            self.dump = []
sam3/eval/hota_eval_toolkit/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # flake8: noqa
sam3/eval/hota_eval_toolkit/run_ytvis_eval.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ """run_youtube_vis.py
4
+ Run example:
5
+ run_youtube_vis.py --USE_PARALLEL False --METRICS HOTA --TRACKERS_TO_EVAL STEm_Seg
6
+ Command Line Arguments: Defaults, # Comments
7
+ Eval arguments:
8
+ 'USE_PARALLEL': False,
9
+ 'NUM_PARALLEL_CORES': 8,
10
+ 'BREAK_ON_ERROR': True, # Raises exception and exits with error
11
+ 'RETURN_ON_ERROR': False, # if not BREAK_ON_ERROR, then returns from function on error
12
+ 'LOG_ON_ERROR': os.path.join(code_path, 'error_log.txt'), # if not None, save any errors into a log file.
13
+ 'PRINT_RESULTS': True,
14
+ 'PRINT_ONLY_COMBINED': False,
15
+ 'PRINT_CONFIG': True,
16
+ 'TIME_PROGRESS': True,
17
+ 'DISPLAY_LESS_PROGRESS': True,
18
+ 'OUTPUT_SUMMARY': True,
19
+ 'OUTPUT_EMPTY_CLASSES': True, # If False, summary files are not output for classes with no detections
20
+ 'OUTPUT_DETAILED': True,
21
+ 'PLOT_CURVES': True,
22
+ Dataset arguments:
23
+ 'GT_FOLDER': os.path.join(code_path, 'data/gt/youtube_vis/youtube_vis_training'), # Location of GT data
24
+ 'TRACKERS_FOLDER': os.path.join(code_path, 'data/trackers/youtube_vis/youtube_vis_training'),
25
+ # Trackers location
26
+ 'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
27
+ 'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder)
28
+ 'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes)
29
+ 'SPLIT_TO_EVAL': 'training', # Valid: 'training', 'val'
30
+ 'PRINT_CONFIG': True, # Whether to print current config
31
+ 'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
32
+ 'TRACKER_SUB_FOLDER': 'data', # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
33
+ 'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
34
+ Metric arguments:
35
+ 'METRICS': ['TrackMAP', 'HOTA', 'CLEAR', 'Identity']
36
+ """
37
+
38
+ import argparse
39
+ import os
40
+ import sys
41
+ from multiprocessing import freeze_support
42
+
43
+ from . import trackeval
44
+
45
+
46
def run_ytvis_eval(args=None, gt_json=None, dt_json=None):
    """Run HOTA evaluation on YouTube-VIS style data.

    Args:
        args: optional argv-style list overriding default config values
            (every config key is exposed as a ``--KEY`` flag).
        gt_json: optional ground-truth annotations as an already-loaded JSON
            object, bypassing file reads from GT_FOLDER.
        dt_json: optional tracker results as an already-loaded JSON object.

    Returns:
        ``(output_res, output_msg)`` as produced by ``trackeval.Evaluator.evaluate``.
    """
    # Merge the evaluator / dataset / metric default configs.
    default_eval_config = trackeval.Evaluator.get_default_eval_config()
    # print only combined since TrackMAP is undefined for per sequence breakdowns
    default_eval_config["PRINT_ONLY_COMBINED"] = True
    default_dataset_config = trackeval.datasets.YouTubeVIS.get_default_dataset_config()
    default_metrics_config = {"METRICS": ["HOTA"]}
    config = {
        **default_eval_config,
        **default_dataset_config,
        **default_metrics_config,
    }  # Merge default configs

    # Expose every config key as a command-line flag; list-valued (or None)
    # defaults accept multiple values via nargs="+".
    parser = argparse.ArgumentParser()
    for setting, default in config.items():
        if isinstance(default, list) or default is None:
            parser.add_argument("--" + setting, nargs="+")
        else:
            parser.add_argument("--" + setting)
    parsed = vars(parser.parse_args(args))

    # Coerce string arguments back to the type of each default value.
    # (The former dead "type(None)" branch was removed: it was unreachable
    # under the "is not None" guard.)
    for setting, value in parsed.items():
        if value is None:
            # Flag not supplied on the command line; keep the default.
            continue
        # Check bool before int: bool is a subclass of int in Python.
        if isinstance(config[setting], bool):
            if value == "True":
                config[setting] = True
            elif value == "False":
                config[setting] = False
            else:
                # BUGFIX: the original message was missing the space before
                # "must", producing e.g. "parameter FOOmust be True or False".
                raise Exception(
                    "Command line parameter " + setting + " must be True or False"
                )
        elif isinstance(config[setting], int):
            config[setting] = int(value)
        else:
            config[setting] = value

    eval_config = {k: v for k, v in config.items() if k in default_eval_config.keys()}
    dataset_config = {
        k: v for k, v in config.items() if k in default_dataset_config.keys()
    }
    metrics_config = {
        k: v for k, v in config.items() if k in default_metrics_config.keys()
    }

    # Run code
    evaluator = trackeval.Evaluator(eval_config)
    # allow directly specifying the GT JSON data and Tracker (result)
    # JSON data as Python objects, without reading from files.
    dataset_config["GT_JSON_OBJECT"] = gt_json
    dataset_config["TRACKER_JSON_OBJECT"] = dt_json
    dataset_list = [trackeval.datasets.YouTubeVIS(dataset_config)]
    metrics_list = [
        metric()
        for metric in [trackeval.metrics.HOTA]
        if metric.get_name() in metrics_config["METRICS"]
    ]
    if len(metrics_list) == 0:
        raise Exception("No metrics selected for evaluation")
    output_res, output_msg = evaluator.evaluate(dataset_list, metrics_list)
    return output_res, output_msg
108
+
109
+
110
if __name__ == "__main__":
    import sys

    # Needed on Windows when the script is bundled into a frozen executable.
    freeze_support()
    run_ytvis_eval(args=sys.argv[1:])
sam3/eval/hota_eval_toolkit/trackeval/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ from . import datasets, metrics, utils
4
+ from .eval import Evaluator
sam3/eval/hota_eval_toolkit/trackeval/_timing.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import inspect
4
+ from functools import wraps
5
+ from time import perf_counter
6
+
7
# Global switches: trackeval flips these from its config before evaluating.
DO_TIMING = False
DISPLAY_LESS_PROGRESS = False
# Accumulated wall-clock seconds per timed function/method name.
timer_dict = {}
# Count of timed free-function (non-method) calls printed so far.
counter = 0


def time(f):
    """Decorator that optionally times *f* and reports the result.

    With ``DO_TIMING`` False (the default) the wrapped function runs with no
    measurement overhead. With ``DO_TIMING`` True, each call is timed with
    ``perf_counter``; per-name totals are accumulated in ``timer_dict`` and a
    summary table is printed once ``Evaluator.evaluate`` finishes.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if not DO_TIMING:
            # Timing disabled (or running in parallel): call through directly.
            return f(*args, **kw)

        ts = perf_counter()
        result = f(*args, **kw)
        tt = perf_counter() - ts

        # Derive a printable name ("Class.method" for instance methods).
        # BUGFIX: guard arg_names[0] so zero-argument functions don't raise
        # IndexError when timing is enabled.
        arg_names = inspect.getfullargspec(f)[0]
        is_method = bool(arg_names) and arg_names[0] == "self"
        if is_method and DISPLAY_LESS_PROGRESS:
            return result
        if is_method:
            method_name = type(args[0]).__name__ + "." + f.__name__
        else:
            method_name = f.__name__

        # Record accumulative time in each function for analysis.
        if method_name in timer_dict:
            timer_dict[method_name] += tt
        else:
            timer_dict[method_name] = tt

        if method_name == "Evaluator.evaluate":
            # Evaluation is finished: display the timing summary.
            print("")
            print("Timing analysis:")
            for key, value in timer_dict.items():
                print("%-70s %2.4f sec" % (key, value))
        else:
            # Show selected positional arguments of interest.
            # BUGFIX: the original code crashed with IndexError when
            # tracker/seq/cls were passed as keywords (i >= len(args)) and
            # with TypeError on join for non-string values; guard and str().
            arg_titles = ["tracker", "seq", "cls"]
            arg_vals = []
            for i, a in enumerate(arg_names):
                if a in arg_titles and i < len(args):
                    arg_vals.append(str(args[i]))
            arg_text = "(" + ", ".join(arg_vals) + ")"

            # Methods and plain functions print with different indentation.
            if is_method:
                print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
            elif bool(arg_names) and arg_names[0] == "test":
                pass
            else:
                global counter
                counter += 1
                print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))

        return result

    return wrap
sam3/eval/hota_eval_toolkit/trackeval/datasets/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ from .tao_ow import TAO_OW
4
+ from .youtube_vis import YouTubeVIS
sam3/eval/hota_eval_toolkit/trackeval/datasets/_base_dataset.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import csv
4
+ import io
5
+ import os
6
+ import traceback
7
+ import zipfile
8
+ from abc import ABC, abstractmethod
9
+ from copy import deepcopy
10
+
11
+ import numpy as np
12
+
13
+ from .. import _timing
14
+ from ..utils import TrackEvalException
15
+
16
+
17
class _BaseDataset(ABC):
    """Abstract base class for trackeval datasets.

    Concrete datasets implement raw GT/tracker loading, per-class
    preprocessing and the similarity measure used for matching; this base
    class provides shared helpers (text-file parsing, box/mask IoU,
    euclidean similarity, unique-ID validation).
    """

    @abstractmethod
    def __init__(self):
        self.tracker_list = None
        self.seq_list = None
        self.class_list = None
        self.output_fol = None
        self.output_sub_fol = None
        self.should_classes_combine = True
        self.use_super_categories = False

    # Functions to implement:

    @staticmethod
    @abstractmethod
    def get_default_dataset_config(): ...

    @abstractmethod
    def _load_raw_file(self, tracker, seq, is_gt): ...

    @_timing.time
    @abstractmethod
    def get_preprocessed_seq_data(self, raw_data, cls): ...

    @abstractmethod
    def _calculate_similarities(self, gt_dets_t, tracker_dets_t): ...

    # Helper functions for all datasets:

    @classmethod
    def get_class_name(cls):
        return cls.__name__

    def get_name(self):
        return self.get_class_name()

    def get_output_fol(self, tracker):
        return os.path.join(self.output_fol, tracker, self.output_sub_fol)

    def get_display_name(self, tracker):
        """Can be overwritten if the trackers name (in files) is different to how it should be displayed.
        By default this method just returns the trackers name as is.
        """
        return tracker

    def get_eval_info(self):
        """Return info about the dataset needed for the Evaluator"""
        return self.tracker_list, self.seq_list, self.class_list

    @_timing.time
    def get_raw_seq_data(self, tracker, seq):
        """Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.

        Returns a dict with fields:
            [num_timesteps]: integer
            [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
                list (per timestep) of 1D NDArrays (per det).
            [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (per timestep) of lists of dets.
            [similarity_scores]: list (per timestep) of 2D NDArrays.
            [gt_extras]: dict (per extra) of lists (per timestep) of 1D NDArrays (per det).

        Similarities are computed here (before preprocessing) because both
        preprocessing and evaluation need them, and across all classes so
        that e.g. class confusion matrices can be derived.
        """
        raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
        raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
        raw_data = {**raw_tracker_data, **raw_gt_data}  # Merges dictionaries

        # Calculate similarities for each timestep.
        similarity_scores = []
        for gt_dets_t, tracker_dets_t in zip(
            raw_data["gt_dets"], raw_data["tracker_dets"]
        ):
            similarity_scores.append(
                self._calculate_similarities(gt_dets_t, tracker_dets_t)
            )
        raw_data["similarity_scores"] = similarity_scores
        return raw_data

    @staticmethod
    def _load_simple_text_file(
        file,
        time_col=0,
        id_col=None,
        remove_negative_ids=False,
        valid_filter=None,
        crowd_ignore_filter=None,
        convert_filter=None,
        is_zipped=False,
        zip_file=None,
        force_delimiters=None,
    ):
        """Load detections from a common one-row-per-det text format.

        ``time_col`` gives the timestep column. ``valid_filter`` /
        ``crowd_ignore_filter`` are dicts {column: allowed lowercase values}
        selecting normal / crowd-ignore rows; ``convert_filter`` maps a
        column's lowercase value to a replacement (e.g. class name -> id).
        If ``remove_negative_ids``, rows with a negative ``id_col`` value are
        excluded (not from ignore data). ``is_zipped``/``zip_file`` allow
        reading from inside a zip archive; the CSV dialect is auto-detected
        (optionally constrained by ``force_delimiters``).

        Returns (read_data, crowd_ignore_data): dicts keyed by timestep
        (string) of lists of rows (all values remain strings). Timesteps
        without dets are absent from the keys.
        """
        if remove_negative_ids and id_col is None:
            raise TrackEvalException(
                "remove_negative_ids is True, but id_col is not given."
            )
        if crowd_ignore_filter is None:
            crowd_ignore_filter = {}
        if convert_filter is None:
            convert_filter = {}
        try:
            if is_zipped:  # Either open file directly or within a zip.
                if zip_file is None:
                    raise TrackEvalException(
                        "is_zipped set to True, but no zip_file is given."
                    )
                archive = zipfile.ZipFile(os.path.join(zip_file), "r")
                fp = io.TextIOWrapper(archive.open(file, "r"))
            else:
                fp = open(file)
            read_data = {}
            crowd_ignore_data = {}
            fp.seek(0, os.SEEK_END)
            # check if file is empty
            if fp.tell():
                fp.seek(0)
                dialect = csv.Sniffer().sniff(
                    fp.readline(), delimiters=force_delimiters
                )  # Auto determine structure.
                dialect.skipinitialspace = (
                    True  # Deal with extra spaces between columns
                )
                fp.seek(0)
                reader = csv.reader(fp, dialect)
                for row in reader:
                    try:
                        # Deal with extra trailing spaces at the end of rows
                        if row[-1] == "":
                            row = row[:-1]
                        timestep = str(int(float(row[time_col])))
                        # Read ignore regions separately.
                        is_ignored = False
                        for ignore_key, ignore_value in crowd_ignore_filter.items():
                            if row[ignore_key].lower() in ignore_value:
                                # Convert values in one column (e.g. string to id)
                                for (
                                    convert_key,
                                    convert_value,
                                ) in convert_filter.items():
                                    row[convert_key] = convert_value[
                                        row[convert_key].lower()
                                    ]
                                # Save data separated by timestep.
                                crowd_ignore_data.setdefault(timestep, []).append(
                                    row
                                )
                                is_ignored = True
                        if is_ignored:
                            # An ignore-region det cannot also be a normal det.
                            continue
                        # Exclude some dets if not valid.
                        # BUGFIX: previously `continue` targeted the inner
                        # filter loop, so invalid rows were never skipped.
                        if valid_filter is not None and any(
                            row[key].lower() not in value
                            for key, value in valid_filter.items()
                        ):
                            continue
                        if remove_negative_ids and int(float(row[id_col])) < 0:
                            continue
                        # Convert values in one column (e.g. string to id)
                        for convert_key, convert_value in convert_filter.items():
                            row[convert_key] = convert_value[row[convert_key].lower()]
                        # Save data separated by timestep.
                        read_data.setdefault(timestep, []).append(row)
                    except Exception:
                        exc_str_init = (
                            "In file %s the following line cannot be read correctly: \n"
                            % os.path.basename(file)
                        )
                        exc_str = " ".join([exc_str_init] + row)
                        raise TrackEvalException(exc_str)
            fp.close()
        except Exception:
            print("Error loading file: %s, printing traceback." % file)
            traceback.print_exc()
            raise TrackEvalException(
                "File %s cannot be read because it is either not present or invalidly formatted"
                % os.path.basename(file)
            )
        return read_data, crowd_ignore_data

    @staticmethod
    def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
        """IoU (or IoA if do_ioa) between two sets of segmentation masks.

        Masks are either pycocotools RLE-encoded (``is_encoded``) or numpy
        arrays of shape (num_masks, height, width), which are encoded here.
        IoA (intersection over the area of masks1) is commonly used to test
        whether detections lie within a crowd-ignore region.
        """
        # Only loaded when run to reduce minimum requirements
        from pycocotools import mask as mask_utils

        # use pycocotools for run length encoding of masks
        if not is_encoded:
            masks1 = mask_utils.encode(
                np.array(np.transpose(masks1, (1, 2, 0)), order="F")
            )
            masks2 = mask_utils.encode(
                np.array(np.transpose(masks2, (1, 2, 0)), order="F")
            )

        # use pycocotools for iou computation of rle encoded masks
        ious = mask_utils.iou(masks1, masks2, [do_ioa] * len(masks2))
        if len(masks1) == 0 or len(masks2) == 0:
            # pycocotools returns [] for empty inputs; keep a 2D shape.
            ious = np.asarray(ious).reshape(len(masks1), len(masks2))
        assert (ious >= 0 - np.finfo("float").eps).all()
        assert (ious <= 1 + np.finfo("float").eps).all()

        return ious

    @staticmethod
    def _calculate_box_ious(bboxes1, bboxes2, box_format="xywh", do_ioa=False):
        """IoU (or IoA if do_ioa) between two arrays of boxes.

        Supported formats: 'xywh' and 'x0y0x1y1'. IoA (intersection over the
        area of boxes1) is commonly used to test whether detections lie
        within a crowd-ignore region.
        """
        # BUGFIX: the format check previously used substring membership
        # (`box_format in "xywh"`), silently accepting inputs like "" or "x".
        if box_format == "xywh":
            # layout: (x0, y0, w, h) -> convert copies to corner form.
            bboxes1 = deepcopy(bboxes1)
            bboxes2 = deepcopy(bboxes2)

            bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
            bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
            bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
            bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
        elif box_format != "x0y0x1y1":
            raise TrackEvalException("box_format %s is not implemented" % box_format)

        # layout: (x0, y0, x1, y1)
        min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
        intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(
            min_[..., 3] - max_[..., 1], 0
        )
        area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
            bboxes1[..., 3] - bboxes1[..., 1]
        )

        if do_ioa:
            ioas = np.zeros_like(intersection)
            valid_mask = area1 > 0 + np.finfo("float").eps
            ioas[valid_mask, :] = (
                intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
            )

            return ioas
        else:
            area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
                bboxes2[..., 3] - bboxes2[..., 1]
            )
            union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
            # Degenerate boxes and empty unions contribute zero IoU.
            intersection[area1 <= 0 + np.finfo("float").eps, :] = 0
            intersection[:, area2 <= 0 + np.finfo("float").eps] = 0
            intersection[union <= 0 + np.finfo("float").eps] = 0
            union[union <= 0 + np.finfo("float").eps] = 1
            ious = intersection / union
            return ious

    @staticmethod
    def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
        """Euclidean distance converted to a similarity in [0, 1].

        sim = max(0, 1 - dist/zero_distance). The default zero_distance of
        2.0 matches MOT15_3D, where a 0.5 similarity threshold corresponds
        to a 1m distance threshold for TPs.
        """
        dist = np.linalg.norm(dets1[:, np.newaxis] - dets2[np.newaxis, :], axis=2)
        sim = np.maximum(0, 1 - dist / zero_distance)
        return sim

    @staticmethod
    def _check_unique_ids(data, after_preproc=False):
        """Check the requirement that the tracker_ids and gt_ids are unique per timestep"""

        def _assert_unique(ids_t, source, t):
            # Raise a descriptive TrackEvalException on duplicate IDs.
            if len(ids_t) == 0:
                return
            unique_ids, counts = np.unique(ids_t, return_counts=True)
            if np.max(counts) == 1:
                return
            duplicate_ids = unique_ids[counts > 1]
            exc_str_init = (
                "%s the same ID more than once in a single timestep "
                "(seq: %s, frame: %i, ids:" % (source, data["seq"], t + 1)
            )
            exc_str = " ".join([exc_str_init] + [str(d) for d in duplicate_ids]) + ")"
            if after_preproc:
                # BUGFIX: this note was previously appended to exc_str_init
                # AFTER exc_str was built, so it never reached the exception.
                exc_str += (
                    "\n Note that this error occurred after preprocessing (but not before), "
                    "so ids may not be as in file, and something seems wrong with preproc."
                )
            raise TrackEvalException(exc_str)

        for t, (gt_ids_t, tracker_ids_t) in enumerate(
            zip(data["gt_ids"], data["tracker_ids"])
        ):
            _assert_unique(tracker_ids_t, "Tracker predicts", t)
            _assert_unique(gt_ids_t, "Ground-truth has", t)
sam3/eval/hota_eval_toolkit/trackeval/datasets/tao_ow.py ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import itertools
4
+ import json
5
+ import os
6
+ from collections import defaultdict
7
+
8
+ import numpy as np
9
+ from scipy.optimize import linear_sum_assignment
10
+
11
+ from .. import _timing, utils
12
+ from ..utils import TrackEvalException
13
+ from ._base_dataset import _BaseDataset
14
+
15
+
16
+ class TAO_OW(_BaseDataset):
17
+ """Dataset class for TAO tracking"""
18
+
19
@staticmethod
def get_default_dataset_config():
    """Return the default configuration dict for the TAO-OW dataset class."""
    base = utils.get_code_path()
    return {
        # Location of GT data
        "GT_FOLDER": os.path.join(base, "data/gt/tao/tao_training"),
        # Trackers location
        "TRACKERS_FOLDER": os.path.join(base, "data/trackers/tao/tao_training"),
        # Where to save eval results (if None, same as TRACKERS_FOLDER)
        "OUTPUT_FOLDER": None,
        # Filenames of trackers to eval (if None, all in folder)
        "TRACKERS_TO_EVAL": None,
        # Classes to eval (if None, all classes)
        "CLASSES_TO_EVAL": None,
        # Valid: 'training', 'val'
        "SPLIT_TO_EVAL": "training",
        # Whether to print current config
        "PRINT_CONFIG": True,
        # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
        "TRACKER_SUB_FOLDER": "data",
        # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
        "OUTPUT_SUB_FOLDER": "",
        # Names of trackers to display, if None: TRACKERS_TO_EVAL
        "TRACKER_DISPLAY_NAMES": None,
        # Number of maximal allowed detections per image (0 for unlimited)
        "MAX_DETECTIONS": 300,
        "SUBSET": "all",
    }
42
+
43
def __init__(self, config=None):
    """Initialise dataset, checking that all required files are present.

    Loads the single GT json under GT_FOLDER, optionally restricts it to a
    category subset (known/unknown/distractor), precomputes per-video track and
    image mappings, and loads + normalizes each tracker's single result json.
    """
    super().__init__()
    # Fill non-given config values with defaults
    self.config = utils.init_config(
        config, self.get_default_dataset_config(), self.get_name()
    )
    self.gt_fol = self.config["GT_FOLDER"]
    self.tracker_fol = self.config["TRACKERS_FOLDER"]
    self.should_classes_combine = True
    self.use_super_categories = False

    self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]
    self.output_fol = self.config["OUTPUT_FOLDER"]
    if self.output_fol is None:
        self.output_fol = self.tracker_fol
    self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]

    # The GT folder must contain exactly one json annotation file.
    gt_dir_files = [
        file for file in os.listdir(self.gt_fol) if file.endswith(".json")
    ]
    if len(gt_dir_files) != 1:
        raise TrackEvalException(
            self.gt_fol + " does not contain exactly one json file."
        )

    with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
        self.gt_data = json.load(f)

    self.subset = self.config["SUBSET"]
    if self.subset != "all":
        # Split GT data into `known`, `unknown` or `distractor`
        self._split_known_unknown_distractor()
        self.gt_data = self._filter_gt_data(self.gt_data)

    # merge categories marked with a merged tag in TAO dataset
    self._merge_categories(self.gt_data["annotations"] + self.gt_data["tracks"])

    # Get sequences to eval and sequence information; '/' in video names is
    # replaced so sequence names are filesystem-safe.
    self.seq_list = [
        vid["name"].replace("/", "-") for vid in self.gt_data["videos"]
    ]
    self.seq_name_to_seq_id = {
        vid["name"].replace("/", "-"): vid["id"] for vid in self.gt_data["videos"]
    }
    # compute mappings from videos to annotation data
    self.videos_to_gt_tracks, self.videos_to_gt_images = self._compute_vid_mappings(
        self.gt_data["annotations"]
    )
    # compute sequence lengths (number of images per video)
    self.seq_lengths = {vid["id"]: 0 for vid in self.gt_data["videos"]}
    for img in self.gt_data["images"]:
        self.seq_lengths[img["video_id"]] += 1
    self.seq_to_images_to_timestep = self._compute_image_to_timestep_mappings()
    # Per-video category bookkeeping: categories present in GT, negative
    # categories, and categories not exhaustively labeled.
    self.seq_to_classes = {
        vid["id"]: {
            "pos_cat_ids": list(
                {
                    track["category_id"]
                    for track in self.videos_to_gt_tracks[vid["id"]]
                }
            ),
            "neg_cat_ids": vid["neg_category_ids"],
            "not_exhaustively_labeled_cat_ids": vid["not_exhaustive_category_ids"],
        }
        for vid in self.gt_data["videos"]
    }

    # Get classes to eval
    considered_vid_ids = [self.seq_name_to_seq_id[vid] for vid in self.seq_list]
    seen_cats = set(
        [
            cat_id
            for vid_id in considered_vid_ids
            for cat_id in self.seq_to_classes[vid_id]["pos_cat_ids"]
        ]
    )
    # only classes with ground truth are evaluated in TAO
    self.valid_classes = [
        cls["name"] for cls in self.gt_data["categories"] if cls["id"] in seen_cats
    ]

    # NOTE: this open-world variant is class-agnostic — regardless of
    # CLASSES_TO_EVAL, everything is collapsed into a single "object" class,
    # so the invalid-class branch below can never trigger.
    if self.config["CLASSES_TO_EVAL"]:
        self.class_list = ["object"]  # class-agnostic
        if not all(self.class_list):
            raise TrackEvalException(
                "Attempted to evaluate an invalid class. Only classes "
                + ", ".join(self.valid_classes)
                + " are valid (classes present in ground truth data)."
            )
    else:
        self.class_list = ["object"]  # class-agnostic
    self.class_name_to_class_id = {"object": 1}  # class-agnostic

    # Get trackers to eval
    if self.config["TRACKERS_TO_EVAL"] is None:
        self.tracker_list = os.listdir(self.tracker_fol)
    else:
        self.tracker_list = self.config["TRACKERS_TO_EVAL"]

    if self.config["TRACKER_DISPLAY_NAMES"] is None:
        self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
    elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
        len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
    ):
        self.tracker_to_disp = dict(
            zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
        )
    else:
        raise TrackEvalException(
            "List of tracker files and tracker display names do not match."
        )

    self.tracker_data = {tracker: dict() for tracker in self.tracker_list}

    for tracker in self.tracker_list:
        # Each tracker folder must also contain exactly one json result file.
        tr_dir_files = [
            file
            for file in os.listdir(
                os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
            )
            if file.endswith(".json")
        ]
        if len(tr_dir_files) != 1:
            raise TrackEvalException(
                os.path.join(self.tracker_fol, tracker, self.tracker_sub_fol)
                + " does not contain exactly one json file."
            )
        with open(
            os.path.join(
                self.tracker_fol, tracker, self.tracker_sub_fol, tr_dir_files[0]
            )
        ) as f:
            curr_data = json.load(f)

        # limit detections if MAX_DETECTIONS > 0
        if self.config["MAX_DETECTIONS"]:
            curr_data = self._limit_dets_per_image(curr_data)

        # fill missing video ids
        self._fill_video_ids_inplace(curr_data)

        # make track ids unique over whole evaluation set
        self._make_track_ids_unique(curr_data)

        # merge categories marked with a merged tag in TAO dataset
        self._merge_categories(curr_data)

        # get tracker sequence information
        curr_videos_to_tracker_tracks, curr_videos_to_tracker_images = (
            self._compute_vid_mappings(curr_data)
        )
        self.tracker_data[tracker]["vids_to_tracks"] = curr_videos_to_tracker_tracks
        self.tracker_data[tracker]["vids_to_images"] = curr_videos_to_tracker_images
202
+
203
def get_display_name(self, tracker):
    """Return the display name configured for the given tracker folder name."""
    display_names = self.tracker_to_disp
    return display_names[tracker]
205
+
206
def _load_raw_file(self, tracker, seq, is_gt):
    """Load a file (gt or tracker) in the TAO format

    If is_gt, this returns a dict which contains the fields:
    [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
    [gt_dets]: list (for each timestep) of lists of detections.
    [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
    keys and corresponding segmentations as values) for each track
    [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_lengths]: dictionary with class values
    as keys and lists (for each track) as values

    if not is_gt, this returns a dict which contains the fields:
    [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
    [tracker_dets]: list (for each timestep) of lists of detections.
    [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
    keys and corresponding segmentations as values) for each track
    [classes_to_dt_track_ids, classes_to_dt_track_areas, classes_to_dt_track_lengths]: dictionary with class values
    as keys and lists as values
    [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
    """
    seq_id = self.seq_name_to_seq_id[seq]
    # Select the per-video image list for either GT or this tracker.
    if is_gt:
        imgs = self.videos_to_gt_images[seq_id]
    else:
        imgs = self.tracker_data[tracker]["vids_to_images"][seq_id]

    # Convert data to required format: one slot per timestep, filled below.
    num_timesteps = self.seq_lengths[seq_id]
    img_to_timestep = self.seq_to_images_to_timestep[seq_id]
    data_keys = ["ids", "classes", "dets"]
    if not is_gt:
        data_keys += ["tracker_confidences"]
    raw_data = {key: [None] * num_timesteps for key in data_keys}
    for img in imgs:
        # some tracker data contains images without any ground truth information, these are ignored
        try:
            t = img_to_timestep[img["id"]]
        except KeyError:
            continue
        annotations = img["annotations"]
        raw_data["dets"][t] = np.atleast_2d(
            [ann["bbox"] for ann in annotations]
        ).astype(float)
        raw_data["ids"][t] = np.atleast_1d(
            [ann["track_id"] for ann in annotations]
        ).astype(int)
        # All detections are mapped to a single class id (class-agnostic eval).
        raw_data["classes"][t] = np.atleast_1d([1 for _ in annotations]).astype(
            int
        )  # class-agnostic
        if not is_gt:
            raw_data["tracker_confidences"][t] = np.atleast_1d(
                [ann["score"] for ann in annotations]
            ).astype(float)

    # Timesteps with no image data get correctly-shaped empty arrays.
    for t, d in enumerate(raw_data["dets"]):
        if d is None:
            raw_data["dets"][t] = np.empty((0, 4)).astype(float)
            raw_data["ids"][t] = np.empty(0).astype(int)
            raw_data["classes"][t] = np.empty(0).astype(int)
            if not is_gt:
                raw_data["tracker_confidences"][t] = np.empty(0)

    # Rename keys to the gt_*/tracker_* names the evaluator expects.
    if is_gt:
        key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
    else:
        key_map = {
            "ids": "tracker_ids",
            "classes": "tracker_classes",
            "dets": "tracker_dets",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)

    all_classes = [1]  # class-agnostic

    if is_gt:
        classes_to_consider = all_classes
        all_tracks = self.videos_to_gt_tracks[seq_id]
    else:
        classes_to_consider = all_classes  # class-agnostic
        all_tracks = self.tracker_data[tracker]["vids_to_tracks"][seq_id]

    # Every track belongs to the single agnostic class.
    classes_to_tracks = {
        cls: [track for track in all_tracks] if cls in classes_to_consider else []
        for cls in all_classes
    }  # class-agnostic

    # mapping from classes to track information (per-track {image_id: bbox})
    raw_data["classes_to_tracks"] = {
        cls: [
            {
                det["image_id"]: np.atleast_1d(det["bbox"])
                for det in track["annotations"]
            }
            for track in tracks
        ]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_ids"] = {
        cls: [track["id"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_areas"] = {
        cls: [track["area"] for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }
    raw_data["classes_to_track_lengths"] = {
        cls: [len(track["annotations"]) for track in tracks]
        for cls, tracks in classes_to_tracks.items()
    }

    if not is_gt:
        # Track score = mean of its per-detection scores.
        raw_data["classes_to_dt_track_scores"] = {
            cls: np.array(
                [
                    np.mean([float(x["score"]) for x in track["annotations"]])
                    for track in tracks
                ]
            )
            for cls, tracks in classes_to_tracks.items()
        }

    if is_gt:
        key_map = {
            "classes_to_tracks": "classes_to_gt_tracks",
            "classes_to_track_ids": "classes_to_gt_track_ids",
            "classes_to_track_lengths": "classes_to_gt_track_lengths",
            "classes_to_track_areas": "classes_to_gt_track_areas",
        }
    else:
        key_map = {
            "classes_to_tracks": "classes_to_dt_tracks",
            "classes_to_track_ids": "classes_to_dt_track_ids",
            "classes_to_track_lengths": "classes_to_dt_track_lengths",
            "classes_to_track_areas": "classes_to_dt_track_areas",
        }
    for k, v in key_map.items():
        raw_data[v] = raw_data.pop(k)

    raw_data["num_timesteps"] = num_timesteps
    raw_data["neg_cat_ids"] = self.seq_to_classes[seq_id]["neg_cat_ids"]
    raw_data["not_exhaustively_labeled_cls"] = self.seq_to_classes[seq_id][
        "not_exhaustively_labeled_cat_ids"
    ]
    raw_data["seq"] = seq
    return raw_data
358
+
359
@_timing.time
def get_preprocessed_seq_data(self, raw_data, cls):
    """Preprocess data for a single sequence for a single class ready for evaluation.
    Inputs:
        - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
        - cls is the class to be evaluated.
    Outputs:
        - data is a dict containing all of the information that metrics need to perform evaluation.
            It contains the following fields:
                [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
                [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                [similarity_scores]: list (for each timestep) of 2D NDArrays.
    Notes:
        General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
            1) Extract only detections relevant for the class to be evaluated (including distractor detections).
            2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
                distractor class, or otherwise marked as to be removed.
            3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
                other criteria (e.g. are too small).
            4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
        After the above preprocessing steps, this function also calculates the number of gt and tracker detections
            and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
            unique within each timestep.
    TAO:
        In TAO, the 4 preproc steps are as follow:
            1) All classes present in the ground truth data are evaluated separately.
            2) No matched tracker detections are removed.
            3) Unmatched tracker detections are removed if there is not ground truth data and the class does not
                belong to the categories marked as negative for this sequence. Additionally, unmatched tracker
                detections for classes which are marked as not exhaustively labeled are removed.
            4) No gt detections are removed.
        Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
        and the tracks from the tracker data are sorted according to the tracker confidence.
    """
    cls_id = self.class_name_to_class_id[cls]
    is_not_exhaustively_labeled = cls_id in raw_data["not_exhaustively_labeled_cls"]
    is_neg_category = cls_id in raw_data["neg_cat_ids"]

    data_keys = [
        "gt_ids",
        "tracker_ids",
        "gt_dets",
        "tracker_dets",
        "tracker_confidences",
        "similarity_scores",
    ]
    data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
    unique_gt_ids = []
    unique_tracker_ids = []
    num_gt_dets = 0
    num_tracker_dets = 0
    for t in range(raw_data["num_timesteps"]):
        # Only extract relevant dets for this class for preproc and eval (cls)
        gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
        gt_class_mask = gt_class_mask.astype(bool)
        gt_ids = raw_data["gt_ids"][t][gt_class_mask]
        gt_dets = raw_data["gt_dets"][t][gt_class_mask]

        tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
        tracker_class_mask = tracker_class_mask.astype(bool)
        tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
        tracker_dets = raw_data["tracker_dets"][t][tracker_class_mask]
        tracker_confidences = raw_data["tracker_confidences"][t][tracker_class_mask]
        similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
            :, tracker_class_mask
        ]

        # Match tracker and gt dets (with hungarian algorithm).
        # Similarities below 0.5 are zeroed out so they cannot form a match.
        unmatched_indices = np.arange(tracker_ids.shape[0])
        if gt_ids.shape[0] > 0 and tracker_ids.shape[0] > 0:
            matching_scores = similarity_scores.copy()
            matching_scores[matching_scores < 0.5 - np.finfo("float").eps] = 0
            match_rows, match_cols = linear_sum_assignment(-matching_scores)
            # Drop assignment pairs whose zeroed score means "no real match".
            actually_matched_mask = (
                matching_scores[match_rows, match_cols] > 0 + np.finfo("float").eps
            )
            match_cols = match_cols[actually_matched_mask]
            unmatched_indices = np.delete(unmatched_indices, match_cols, axis=0)

        # Unmatched tracker dets are discarded when there is no GT for this
        # frame (and the class is not marked negative), or when the class is
        # not exhaustively labeled.
        if gt_ids.shape[0] == 0 and not is_neg_category:
            to_remove_tracker = unmatched_indices
        elif is_not_exhaustively_labeled:
            to_remove_tracker = unmatched_indices
        else:
            to_remove_tracker = np.array([], dtype=int)

        # remove all unwanted unmatched tracker detections
        data["tracker_ids"][t] = np.delete(tracker_ids, to_remove_tracker, axis=0)
        data["tracker_dets"][t] = np.delete(tracker_dets, to_remove_tracker, axis=0)
        data["tracker_confidences"][t] = np.delete(
            tracker_confidences, to_remove_tracker, axis=0
        )
        similarity_scores = np.delete(similarity_scores, to_remove_tracker, axis=1)

        data["gt_ids"][t] = gt_ids
        data["gt_dets"][t] = gt_dets
        data["similarity_scores"][t] = similarity_scores

        unique_gt_ids += list(np.unique(data["gt_ids"][t]))
        unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
        num_tracker_dets += len(data["tracker_ids"][t])
        num_gt_dets += len(data["gt_ids"][t])

    # Re-label IDs such that there are no empty IDs (contiguous 0..N-1 via a
    # lookup array indexed by the original id).
    if len(unique_gt_ids) > 0:
        unique_gt_ids = np.unique(unique_gt_ids)
        gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
        gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["gt_ids"][t]) > 0:
                data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
    if len(unique_tracker_ids) > 0:
        unique_tracker_ids = np.unique(unique_tracker_ids)
        tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
        tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
        for t in range(raw_data["num_timesteps"]):
            if len(data["tracker_ids"][t]) > 0:
                data["tracker_ids"][t] = tracker_id_map[
                    data["tracker_ids"][t]
                ].astype(int)

    # Record overview statistics.
    data["num_tracker_dets"] = num_tracker_dets
    data["num_gt_dets"] = num_gt_dets
    data["num_tracker_ids"] = len(unique_tracker_ids)
    data["num_gt_ids"] = len(unique_gt_ids)
    data["num_timesteps"] = raw_data["num_timesteps"]
    data["seq"] = raw_data["seq"]

    # get track representations
    data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
    data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
    data["gt_track_lengths"] = raw_data["classes_to_gt_track_lengths"][cls_id]
    data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
    data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
    data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
    data["dt_track_lengths"] = raw_data["classes_to_dt_track_lengths"][cls_id]
    data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
    data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
    data["not_exhaustively_labeled"] = is_not_exhaustively_labeled
    data["iou_type"] = "bbox"

    # sort tracker data tracks by tracker confidence scores (stable sort keeps
    # the original order among equal scores)
    if data["dt_tracks"]:
        idx = np.argsort(
            [-score for score in data["dt_track_scores"]], kind="mergesort"
        )
        data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
        data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
        data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
        data["dt_track_lengths"] = [data["dt_track_lengths"][i] for i in idx]
        data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]
    # Ensure that ids are unique per timestep.
    self._check_unique_ids(data)

    return data
516
+
517
def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
    """GT/tracker similarity for TAO-OW is plain box IoU."""
    return self._calculate_box_ious(gt_dets_t, tracker_dets_t)
520
+
521
+ def _merge_categories(self, annotations):
522
+ """
523
+ Merges categories with a merged tag. Adapted from https://github.com/TAO-Dataset
524
+ :param annotations: the annotations in which the classes should be merged
525
+ :return: None
526
+ """
527
+ merge_map = {}
528
+ for category in self.gt_data["categories"]:
529
+ if "merged" in category:
530
+ for to_merge in category["merged"]:
531
+ merge_map[to_merge["id"]] = category["id"]
532
+
533
+ for ann in annotations:
534
+ ann["category_id"] = merge_map.get(ann["category_id"], ann["category_id"])
535
+
536
+ def _compute_vid_mappings(self, annotations):
537
+ """
538
+ Computes mappings from Videos to corresponding tracks and images.
539
+ :param annotations: the annotations for which the mapping should be generated
540
+ :return: the video-to-track-mapping, the video-to-image-mapping
541
+ """
542
+ vids_to_tracks = {}
543
+ vids_to_imgs = {}
544
+ vid_ids = [vid["id"] for vid in self.gt_data["videos"]]
545
+
546
+ # compute an mapping from image IDs to images
547
+ images = {}
548
+ for image in self.gt_data["images"]:
549
+ images[image["id"]] = image
550
+
551
+ for ann in annotations:
552
+ ann["area"] = ann["bbox"][2] * ann["bbox"][3]
553
+
554
+ vid = ann["video_id"]
555
+ if ann["video_id"] not in vids_to_tracks.keys():
556
+ vids_to_tracks[ann["video_id"]] = list()
557
+ if ann["video_id"] not in vids_to_imgs.keys():
558
+ vids_to_imgs[ann["video_id"]] = list()
559
+
560
+ # Fill in vids_to_tracks
561
+ tid = ann["track_id"]
562
+ exist_tids = [track["id"] for track in vids_to_tracks[vid]]
563
+ try:
564
+ index1 = exist_tids.index(tid)
565
+ except ValueError:
566
+ index1 = -1
567
+ if tid not in exist_tids:
568
+ curr_track = {
569
+ "id": tid,
570
+ "category_id": ann["category_id"],
571
+ "video_id": vid,
572
+ "annotations": [ann],
573
+ }
574
+ vids_to_tracks[vid].append(curr_track)
575
+ else:
576
+ vids_to_tracks[vid][index1]["annotations"].append(ann)
577
+
578
+ # Fill in vids_to_imgs
579
+ img_id = ann["image_id"]
580
+ exist_img_ids = [img["id"] for img in vids_to_imgs[vid]]
581
+ try:
582
+ index2 = exist_img_ids.index(img_id)
583
+ except ValueError:
584
+ index2 = -1
585
+ if index2 == -1:
586
+ curr_img = {"id": img_id, "annotations": [ann]}
587
+ vids_to_imgs[vid].append(curr_img)
588
+ else:
589
+ vids_to_imgs[vid][index2]["annotations"].append(ann)
590
+
591
+ # sort annotations by frame index and compute track area
592
+ for vid, tracks in vids_to_tracks.items():
593
+ for track in tracks:
594
+ track["annotations"] = sorted(
595
+ track["annotations"],
596
+ key=lambda x: images[x["image_id"]]["frame_index"],
597
+ )
598
+ # Computer average area
599
+ track["area"] = sum(x["area"] for x in track["annotations"]) / len(
600
+ track["annotations"]
601
+ )
602
+
603
+ # Ensure all videos are present
604
+ for vid_id in vid_ids:
605
+ if vid_id not in vids_to_tracks.keys():
606
+ vids_to_tracks[vid_id] = []
607
+ if vid_id not in vids_to_imgs.keys():
608
+ vids_to_imgs[vid_id] = []
609
+
610
+ return vids_to_tracks, vids_to_imgs
611
+
612
+ def _compute_image_to_timestep_mappings(self):
613
+ """
614
+ Computes a mapping from images to the corresponding timestep in the sequence.
615
+ :return: the image-to-timestep-mapping
616
+ """
617
+ images = {}
618
+ for image in self.gt_data["images"]:
619
+ images[image["id"]] = image
620
+
621
+ seq_to_imgs_to_timestep = {vid["id"]: dict() for vid in self.gt_data["videos"]}
622
+ for vid in seq_to_imgs_to_timestep:
623
+ curr_imgs = [img["id"] for img in self.videos_to_gt_images[vid]]
624
+ curr_imgs = sorted(curr_imgs, key=lambda x: images[x]["frame_index"])
625
+ seq_to_imgs_to_timestep[vid] = {
626
+ curr_imgs[i]: i for i in range(len(curr_imgs))
627
+ }
628
+
629
+ return seq_to_imgs_to_timestep
630
+
631
+ def _limit_dets_per_image(self, annotations):
632
+ """
633
+ Limits the number of detections for each image to config['MAX_DETECTIONS']. Adapted from
634
+ https://github.com/TAO-Dataset/
635
+ :param annotations: the annotations in which the detections should be limited
636
+ :return: the annotations with limited detections
637
+ """
638
+ max_dets = self.config["MAX_DETECTIONS"]
639
+ img_ann = defaultdict(list)
640
+ for ann in annotations:
641
+ img_ann[ann["image_id"]].append(ann)
642
+
643
+ for img_id, _anns in img_ann.items():
644
+ if len(_anns) <= max_dets:
645
+ continue
646
+ _anns = sorted(_anns, key=lambda x: x["score"], reverse=True)
647
+ img_ann[img_id] = _anns[:max_dets]
648
+
649
+ return [ann for anns in img_ann.values() for ann in anns]
650
+
651
+ def _fill_video_ids_inplace(self, annotations):
652
+ """
653
+ Fills in missing video IDs inplace. Adapted from https://github.com/TAO-Dataset/
654
+ :param annotations: the annotations for which the videos IDs should be filled inplace
655
+ :return: None
656
+ """
657
+ missing_video_id = [x for x in annotations if "video_id" not in x]
658
+ if missing_video_id:
659
+ image_id_to_video_id = {
660
+ x["id"]: x["video_id"] for x in self.gt_data["images"]
661
+ }
662
+ for x in missing_video_id:
663
+ x["video_id"] = image_id_to_video_id[x["image_id"]]
664
+
665
+ @staticmethod
666
+ def _make_track_ids_unique(annotations):
667
+ """
668
+ Makes the track IDs unqiue over the whole annotation set. Adapted from https://github.com/TAO-Dataset/
669
+ :param annotations: the annotation set
670
+ :return: the number of updated IDs
671
+ """
672
+ track_id_videos = {}
673
+ track_ids_to_update = set()
674
+ max_track_id = 0
675
+ for ann in annotations:
676
+ t = ann["track_id"]
677
+ if t not in track_id_videos:
678
+ track_id_videos[t] = ann["video_id"]
679
+
680
+ if ann["video_id"] != track_id_videos[t]:
681
+ # Track id is assigned to multiple videos
682
+ track_ids_to_update.add(t)
683
+ max_track_id = max(max_track_id, t)
684
+
685
+ if track_ids_to_update:
686
+ print("true")
687
+ next_id = itertools.count(max_track_id + 1)
688
+ new_track_ids = defaultdict(lambda: next(next_id))
689
+ for ann in annotations:
690
+ t = ann["track_id"]
691
+ v = ann["video_id"]
692
+ if t in track_ids_to_update:
693
+ ann["track_id"] = new_track_ids[t, v]
694
+ return len(track_ids_to_update)
695
+
696
def _split_known_unknown_distractor(self):
    """Partition all TAO-OW category ids into known / distractor / unknown sets.

    Populates self.knowns, self.distractors and self.unknowns, which
    _filter_gt_data uses when SUBSET != 'all'.
    """
    all_ids = set(
        [i for i in range(1, 2000)]
    )  # 2000 is larger than the max category id in TAO-OW.
    # `knowns` includes 78 TAO_category_ids that corresponds to 78 COCO classes.
    # (The other 2 COCO classes do not have corresponding classes in TAO).
    self.knowns = {
        4, 13, 1038, 544, 1057, 34, 35, 36, 41, 45, 58, 60, 579, 1091, 1097,
        1099, 78, 79, 81, 91, 1115, 1117, 95, 1122, 99, 1132, 621, 1135, 625,
        118, 1144, 126, 642, 1155, 133, 1162, 139, 154, 174, 185, 699, 1215,
        714, 717, 1229, 211, 729, 221, 229, 747, 235, 237, 779, 276, 805, 299,
        829, 852, 347, 371, 382, 896, 392, 926, 937, 428, 429, 961, 452, 979,
        980, 982, 475, 480, 993, 1001, 502, 1018,
    }
    # `distractors` is defined as in the paper "Opening up Open-World Tracking"
    self.distractors = {
        20, 63, 108, 180, 188, 204, 212, 247, 303, 403, 407, 415, 490, 504,
        507, 513, 529, 567, 569, 588, 672, 691, 702, 708, 711, 720, 736, 737,
        798, 813, 815, 827, 831, 851, 877, 883, 912, 971, 976, 1130, 1133,
        1134, 1169, 1184, 1220,
    }
    # Everything that is neither known nor a distractor counts as unknown.
    self.unknowns = all_ids.difference(self.knowns.union(self.distractors))
831
+
832
+ def _filter_gt_data(self, raw_gt_data):
833
+ """
834
+ Filter out irrelevant data in the raw_gt_data
835
+ Args:
836
+ raw_gt_data: directly loaded from json.
837
+
838
+ Returns:
839
+ filtered gt_data
840
+ """
841
+ valid_cat_ids = list()
842
+ if self.subset == "known":
843
+ valid_cat_ids = self.knowns
844
+ elif self.subset == "distractor":
845
+ valid_cat_ids = self.distractors
846
+ elif self.subset == "unknown":
847
+ valid_cat_ids = self.unknowns
848
+ # elif self.subset == "test_only_unknowns":
849
+ # valid_cat_ids = test_only_unknowns
850
+ else:
851
+ raise Exception("The parameter `SUBSET` is incorrect")
852
+
853
+ filtered = dict()
854
+ filtered["videos"] = raw_gt_data["videos"]
855
+ # filtered["videos"] = list()
856
+ unwanted_vid = set()
857
+ # for video in raw_gt_data["videos"]:
858
+ # datasrc = video["name"].split('/')[1]
859
+ # if datasrc in data_srcs:
860
+ # filtered["videos"].append(video)
861
+ # else:
862
+ # unwanted_vid.add(video["id"])
863
+
864
+ filtered["annotations"] = list()
865
+ for ann in raw_gt_data["annotations"]:
866
+ if (ann["video_id"] not in unwanted_vid) and (
867
+ ann["category_id"] in valid_cat_ids
868
+ ):
869
+ filtered["annotations"].append(ann)
870
+
871
+ filtered["tracks"] = list()
872
+ for track in raw_gt_data["tracks"]:
873
+ if (track["video_id"] not in unwanted_vid) and (
874
+ track["category_id"] in valid_cat_ids
875
+ ):
876
+ filtered["tracks"].append(track)
877
+
878
+ filtered["images"] = list()
879
+ for image in raw_gt_data["images"]:
880
+ if image["video_id"] not in unwanted_vid:
881
+ filtered["images"].append(image)
882
+
883
+ filtered["categories"] = list()
884
+ for cat in raw_gt_data["categories"]:
885
+ if cat["id"] in valid_cat_ids:
886
+ filtered["categories"].append(cat)
887
+
888
+ filtered["info"] = raw_gt_data["info"]
889
+ filtered["licenses"] = raw_gt_data["licenses"]
890
+
891
+ return filtered
sam3/eval/hota_eval_toolkit/trackeval/datasets/youtube_vis.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ # note: this file has been modified from its original version in TrackEval in
4
+ # https://github.com/JonathonLuiten/TrackEval/blob/master/trackeval/datasets/youtube_vis.py
5
+ # to support the following:
6
+ # 1) bbox evaluation (via `IOU_TYPE`)
7
+ # 2) passing GT and prediction data as Python objects (via `GT_JSON_OBJECT` and `TRACKER_JSON_OBJECT`)
8
+ # 3) specifying a custom dataset name (via `DATASET_NAME`)
9
+
10
+ import json
11
+ import os
12
+
13
+ import numpy as np
14
+
15
+ from .. import _timing, utils
16
+ from ..utils import TrackEvalException
17
+ from ._base_dataset import _BaseDataset
18
+
19
+
20
+ class YouTubeVIS(_BaseDataset):
21
+ """Dataset class for YouTubeVIS tracking"""
22
+
23
+ @staticmethod
24
+ def get_default_dataset_config():
25
+ """Default class config values"""
26
+ code_path = utils.get_code_path()
27
+ default_config = {
28
+ "GT_FOLDER": os.path.join(
29
+ code_path, "data/gt/youtube_vis/"
30
+ ), # Location of GT data
31
+ "TRACKERS_FOLDER": os.path.join(code_path, "data/trackers/youtube_vis/"),
32
+ # Trackers location
33
+ "OUTPUT_FOLDER": None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
34
+ "TRACKERS_TO_EVAL": None, # Filenames of trackers to eval (if None, all in folder)
35
+ "CLASSES_TO_EVAL": None, # Classes to eval (if None, all classes)
36
+ "SPLIT_TO_EVAL": "train_sub_split", # Valid: 'train', 'val', 'train_sub_split'
37
+ "PRINT_CONFIG": True, # Whether to print current config
38
+ "OUTPUT_SUB_FOLDER": "", # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
39
+ "TRACKER_SUB_FOLDER": "data", # Tracker files are in TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
40
+ "TRACKER_DISPLAY_NAMES": None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
41
+ # Added for video phrase AP evaluation -- allow directly specifying the GT JSON data and Tracker (result)
42
+ # JSON data as Python objects, without reading from files.
43
+ "GT_JSON_OBJECT": None,
44
+ "TRACKER_JSON_OBJECT": None,
45
+ "IOU_TYPE": "segm",
46
+ "DATASET_NAME": "video",
47
+ }
48
+ return default_config
49
+
50
    def __init__(self, config=None):
        """Initialise dataset, checking that all required files are present"""
        super().__init__()
        # Fill non-given config values with defaults
        self.config = utils.init_config(config, self.get_default_dataset_config())
        # GT/tracker folders are "<base>youtube_vis_<split>"
        self.gt_fol = (
            self.config["GT_FOLDER"] + "youtube_vis_" + self.config["SPLIT_TO_EVAL"]
        )
        self.tracker_fol = (
            self.config["TRACKERS_FOLDER"]
            + "youtube_vis_"
            + self.config["SPLIT_TO_EVAL"]
        )
        self.use_super_categories = False
        self.should_classes_combine = True
        # iou_type selects mask IoU ("segm") or box IoU ("bbox") similarity
        assert self.config["IOU_TYPE"] in ["segm", "bbox"]
        self.iou_type = self.config["IOU_TYPE"]
        print("=" * 100)
        print(f"Evaluate annotation type *{self.iou_type}*")
        self.dataset_name = self.config["DATASET_NAME"]

        self.output_fol = self.config["OUTPUT_FOLDER"]
        if self.output_fol is None:
            self.output_fol = self.tracker_fol
        self.output_sub_fol = self.config["OUTPUT_SUB_FOLDER"]
        self.tracker_sub_fol = self.config["TRACKER_SUB_FOLDER"]

        if self.config["GT_JSON_OBJECT"] is not None:
            # allow directly specifying the GT JSON data without reading from files
            gt_json = self.config["GT_JSON_OBJECT"]
            assert isinstance(gt_json, dict)
            assert "videos" in gt_json
            assert "categories" in gt_json
            assert "annotations" in gt_json
            self.gt_data = gt_json
        else:
            if not os.path.exists(self.gt_fol):
                print("GT folder not found: " + self.gt_fol)
                raise TrackEvalException(
                    "GT folder not found: " + os.path.basename(self.gt_fol)
                )
            # the GT folder must contain exactly one annotation JSON file
            gt_dir_files = [
                file for file in os.listdir(self.gt_fol) if file.endswith(".json")
            ]
            if len(gt_dir_files) != 1:
                raise TrackEvalException(
                    self.gt_fol + " does not contain exactly one json file."
                )

            with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
                self.gt_data = json.load(f)

        # Get classes to eval
        self.valid_classes = [cls["name"] for cls in self.gt_data["categories"]]
        cls_name_to_cls_id_map = {
            cls["name"]: cls["id"] for cls in self.gt_data["categories"]
        }

        if self.config["CLASSES_TO_EVAL"]:
            # invalid class names map to None so the `all` check below fails
            self.class_list = [
                cls.lower() if cls.lower() in self.valid_classes else None
                for cls in self.config["CLASSES_TO_EVAL"]
            ]
            if not all(self.class_list):
                raise TrackEvalException(
                    "Attempted to evaluate an invalid class. Only classes "
                    + ", ".join(self.valid_classes)
                    + " are valid."
                )
        else:
            self.class_list = [cls["name"] for cls in self.gt_data["categories"]]
        self.class_name_to_class_id = {
            k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list
        }

        # Get sequences to eval and check gt files exist
        # sequence name = top-level directory of the video's first frame path
        self.seq_list = [
            vid["file_names"][0].split("/")[0] for vid in self.gt_data["videos"]
        ]
        self.seq_name_to_seq_id = {
            vid["file_names"][0].split("/")[0]: vid["id"]
            for vid in self.gt_data["videos"]
        }
        # number of timesteps per video = number of frame file names
        self.seq_lengths = {
            vid["id"]: len(vid["file_names"]) for vid in self.gt_data["videos"]
        }

        # encode masks and compute track areas
        self._prepare_gt_annotations()

        # Get trackers to eval
        if self.config["TRACKER_JSON_OBJECT"] is not None:
            # allow directly specifying the tracker JSON data without reading from files
            tracker_json = self.config["TRACKER_JSON_OBJECT"]
            assert isinstance(tracker_json, list)
            self.tracker_list = ["tracker"]
        elif self.config["TRACKERS_TO_EVAL"] is None:
            self.tracker_list = os.listdir(self.tracker_fol)
        else:
            self.tracker_list = self.config["TRACKERS_TO_EVAL"]

        if self.config["TRACKER_DISPLAY_NAMES"] is None:
            # default: display each tracker under its own folder name
            self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
        elif (self.config["TRACKERS_TO_EVAL"] is not None) and (
            len(self.config["TRACKER_DISPLAY_NAMES"]) == len(self.tracker_list)
        ):
            self.tracker_to_disp = dict(
                zip(self.tracker_list, self.config["TRACKER_DISPLAY_NAMES"])
            )
        else:
            raise TrackEvalException(
                "List of tracker files and tracker display names do not match."
            )

        # counter for globally unique track IDs
        self.global_tid_counter = 0

        self.tracker_data = dict()
        if self.config["TRACKER_JSON_OBJECT"] is not None:
            # allow directly specifying the tracker JSON data without reading from files
            tracker = self.tracker_list[0]
            self.tracker_data[tracker] = tracker_json
        else:
            # each tracker folder must contain exactly one result JSON file
            for tracker in self.tracker_list:
                tracker_dir_path = os.path.join(
                    self.tracker_fol, tracker, self.tracker_sub_fol
                )
                tr_dir_files = [
                    file
                    for file in os.listdir(tracker_dir_path)
                    if file.endswith(".json")
                ]
                if len(tr_dir_files) != 1:
                    raise TrackEvalException(
                        tracker_dir_path + " does not contain exactly one json file."
                    )

                with open(os.path.join(tracker_dir_path, tr_dir_files[0])) as f:
                    curr_data = json.load(f)

                self.tracker_data[tracker] = curr_data
192
+ def get_display_name(self, tracker):
193
+ return self.tracker_to_disp[tracker]
194
+
195
    def _load_raw_file(self, tracker, seq, is_gt):
        """Load a file (gt or tracker) in the YouTubeVIS format
        If is_gt, this returns a dict which contains the fields:
        [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
        [gt_dets]: list (for each timestep) of lists of detections.
        [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
        keys and corresponding segmentations as values) for each track
        [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_iscrowd]: dictionary with class values
        as keys and lists (for each track) as values

        if not is_gt, this returns a dict which contains the fields:
        [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
        [tracker_dets]: list (for each timestep) of lists of detections.
        [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
        keys and corresponding segmentations as values) for each track
        [classes_to_dt_track_ids, classes_to_dt_track_areas]: dictionary with class values as keys and lists as values
        [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
        """
        # select sequence tracks
        seq_id = self.seq_name_to_seq_id[seq]
        if is_gt:
            tracks = [
                ann for ann in self.gt_data["annotations"] if ann["video_id"] == seq_id
            ]
        else:
            # tracker tracks also get areas and globally unique ids assigned here
            tracks = self._get_tracker_seq_tracks(tracker, seq_id)

        # Convert data to required format
        num_timesteps = self.seq_lengths[seq_id]
        data_keys = ["ids", "classes", "dets"]
        if not is_gt:
            data_keys += ["tracker_confidences"]
        raw_data = {key: [None] * num_timesteps for key in data_keys}
        # per-frame detections come from "segmentations" (segm) or "bboxes" (bbox)
        result_key = "segmentations" if self.iou_type == "segm" else "bboxes"
        for t in range(num_timesteps):
            # only tracks with a non-empty detection at frame t contribute
            raw_data["dets"][t] = [
                track[result_key][t] for track in tracks if track[result_key][t]
            ]
            raw_data["ids"][t] = np.atleast_1d(
                [track["id"] for track in tracks if track[result_key][t]]
            ).astype(int)
            raw_data["classes"][t] = np.atleast_1d(
                [track["category_id"] for track in tracks if track[result_key][t]]
            ).astype(int)
            if not is_gt:
                raw_data["tracker_confidences"][t] = np.atleast_1d(
                    [track["score"] for track in tracks if track[result_key][t]]
                ).astype(float)

        # rename generic keys to gt_/tracker_-prefixed keys
        if is_gt:
            key_map = {"ids": "gt_ids", "classes": "gt_classes", "dets": "gt_dets"}
        else:
            key_map = {
                "ids": "tracker_ids",
                "classes": "tracker_classes",
                "dets": "tracker_dets",
            }
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)

        # group tracks by class id (including classes with no tracks)
        all_cls_ids = {self.class_name_to_class_id[cls] for cls in self.class_list}
        classes_to_tracks = {
            cls: [track for track in tracks if track["category_id"] == cls]
            for cls in all_cls_ids
        }

        # mapping from classes to track representations and track information
        raw_data["classes_to_tracks"] = {
            cls: [
                {i: track[result_key][i] for i in range(len(track[result_key]))}
                for track in tracks
            ]
            for cls, tracks in classes_to_tracks.items()
        }
        raw_data["classes_to_track_ids"] = {
            cls: [track["id"] for track in tracks]
            for cls, tracks in classes_to_tracks.items()
        }
        raw_data["classes_to_track_areas"] = {
            cls: [track["area"] for track in tracks]
            for cls, tracks in classes_to_tracks.items()
        }

        if is_gt:
            raw_data["classes_to_gt_track_iscrowd"] = {
                cls: [track["iscrowd"] for track in tracks]
                for cls, tracks in classes_to_tracks.items()
            }
        else:
            raw_data["classes_to_dt_track_scores"] = {
                cls: np.array([track["score"] for track in tracks])
                for cls, tracks in classes_to_tracks.items()
            }

        # rename class-level keys to gt_/dt_-prefixed keys
        if is_gt:
            key_map = {
                "classes_to_tracks": "classes_to_gt_tracks",
                "classes_to_track_ids": "classes_to_gt_track_ids",
                "classes_to_track_areas": "classes_to_gt_track_areas",
            }
        else:
            key_map = {
                "classes_to_tracks": "classes_to_dt_tracks",
                "classes_to_track_ids": "classes_to_dt_track_ids",
                "classes_to_track_areas": "classes_to_dt_track_areas",
            }
        for k, v in key_map.items():
            raw_data[v] = raw_data.pop(k)

        raw_data["num_timesteps"] = num_timesteps
        raw_data["seq"] = seq
        return raw_data
307
+
308
    @_timing.time
    def get_preprocessed_seq_data(self, raw_data, cls):
        """Preprocess data for a single sequence for a single class ready for evaluation.
        Inputs:
             - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
             - cls is the class to be evaluated.
        Outputs:
             - data is a dict containing all of the information that metrics need to perform evaluation.
                It contains the following fields:
                    [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
                    [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
                    [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
                    [similarity_scores]: list (for each timestep) of 2D NDArrays.
        Notes:
            General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
                1) Extract only detections relevant for the class to be evaluated (including distractor detections).
                2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
                    distractor class, or otherwise marked as to be removed.
                3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet a certain
                    other criteria (e.g. are too small).
                4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
            After the above preprocessing steps, this function also calculates the number of gt and tracker detections
                and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
                unique within each timestep.
        YouTubeVIS:
            In YouTubeVIS, the 4 preproc steps are as follow:
                1) There are 40 classes which are evaluated separately.
                2) No matched tracker dets are removed.
                3) No unmatched tracker dets are removed.
                4) No gt dets are removed.
            Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
            and the tracks from the tracker data are sorted according to the tracker confidence.
        """
        cls_id = self.class_name_to_class_id[cls]

        data_keys = [
            "gt_ids",
            "tracker_ids",
            "gt_dets",
            "tracker_dets",
            "similarity_scores",
        ]
        data = {key: [None] * raw_data["num_timesteps"] for key in data_keys}
        unique_gt_ids = []
        unique_tracker_ids = []
        num_gt_dets = 0
        num_tracker_dets = 0

        for t in range(raw_data["num_timesteps"]):
            # Only extract relevant dets for this class for eval (cls)
            gt_class_mask = np.atleast_1d(raw_data["gt_classes"][t] == cls_id)
            gt_class_mask = gt_class_mask.astype(bool)
            gt_ids = raw_data["gt_ids"][t][gt_class_mask]
            gt_dets = [
                raw_data["gt_dets"][t][ind]
                for ind in range(len(gt_class_mask))
                if gt_class_mask[ind]
            ]

            tracker_class_mask = np.atleast_1d(raw_data["tracker_classes"][t] == cls_id)
            tracker_class_mask = tracker_class_mask.astype(bool)
            tracker_ids = raw_data["tracker_ids"][t][tracker_class_mask]
            tracker_dets = [
                raw_data["tracker_dets"][t][ind]
                for ind in range(len(tracker_class_mask))
                if tracker_class_mask[ind]
            ]
            # restrict the similarity matrix to the rows/cols of this class
            similarity_scores = raw_data["similarity_scores"][t][gt_class_mask, :][
                :, tracker_class_mask
            ]

            data["tracker_ids"][t] = tracker_ids
            data["tracker_dets"][t] = tracker_dets
            data["gt_ids"][t] = gt_ids
            data["gt_dets"][t] = gt_dets
            data["similarity_scores"][t] = similarity_scores

            unique_gt_ids += list(np.unique(data["gt_ids"][t]))
            unique_tracker_ids += list(np.unique(data["tracker_ids"][t]))
            num_tracker_dets += len(data["tracker_ids"][t])
            num_gt_dets += len(data["gt_ids"][t])

        # Re-label IDs such that there are no empty IDs
        # (map original ids to a contiguous 0..N-1 range via a lookup array)
        if len(unique_gt_ids) > 0:
            unique_gt_ids = np.unique(unique_gt_ids)
            gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
            gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
            for t in range(raw_data["num_timesteps"]):
                if len(data["gt_ids"][t]) > 0:
                    data["gt_ids"][t] = gt_id_map[data["gt_ids"][t]].astype(int)
        if len(unique_tracker_ids) > 0:
            unique_tracker_ids = np.unique(unique_tracker_ids)
            tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
            tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
            for t in range(raw_data["num_timesteps"]):
                if len(data["tracker_ids"][t]) > 0:
                    data["tracker_ids"][t] = tracker_id_map[
                        data["tracker_ids"][t]
                    ].astype(int)

        # Ensure that ids are unique per timestep.
        self._check_unique_ids(data)

        # Record overview statistics.
        data["num_tracker_dets"] = num_tracker_dets
        data["num_gt_dets"] = num_gt_dets
        data["num_tracker_ids"] = len(unique_tracker_ids)
        data["num_gt_ids"] = len(unique_gt_ids)
        data["num_timesteps"] = raw_data["num_timesteps"]
        data["seq"] = raw_data["seq"]

        # get track representations
        data["gt_tracks"] = raw_data["classes_to_gt_tracks"][cls_id]
        data["gt_track_ids"] = raw_data["classes_to_gt_track_ids"][cls_id]
        data["gt_track_areas"] = raw_data["classes_to_gt_track_areas"][cls_id]
        data["gt_track_iscrowd"] = raw_data["classes_to_gt_track_iscrowd"][cls_id]
        data["dt_tracks"] = raw_data["classes_to_dt_tracks"][cls_id]
        data["dt_track_ids"] = raw_data["classes_to_dt_track_ids"][cls_id]
        data["dt_track_areas"] = raw_data["classes_to_dt_track_areas"][cls_id]
        data["dt_track_scores"] = raw_data["classes_to_dt_track_scores"][cls_id]
        # NOTE(review): hardcoded to "mask" even when IOU_TYPE is "bbox" —
        # confirm downstream TrackMAP handles this as intended for bbox eval.
        data["iou_type"] = "mask"

        # sort tracker data tracks by tracker confidence scores
        if data["dt_tracks"]:
            # stable (mergesort) descending sort by score
            idx = np.argsort(
                [-score for score in data["dt_track_scores"]], kind="mergesort"
            )
            data["dt_track_scores"] = [data["dt_track_scores"][i] for i in idx]
            data["dt_tracks"] = [data["dt_tracks"][i] for i in idx]
            data["dt_track_ids"] = [data["dt_track_ids"][i] for i in idx]
            data["dt_track_areas"] = [data["dt_track_areas"][i] for i in idx]

        return data
441
+
442
+ def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
443
+ if self.iou_type == "segm":
444
+ similarity_scores = self._calculate_mask_ious(
445
+ gt_dets_t, tracker_dets_t, is_encoded=True, do_ioa=False
446
+ )
447
+ else:
448
+ gt_dets_t = np.array(gt_dets_t, dtype=np.float32).reshape(-1, 4)
449
+ tracker_dets_t = np.array(tracker_dets_t, dtype=np.float32).reshape(-1, 4)
450
+ similarity_scores = self._calculate_box_ious(
451
+ gt_dets_t, tracker_dets_t, box_format="xywh", do_ioa=False
452
+ )
453
+ return similarity_scores
454
+
455
+ def _prepare_gt_annotations(self):
456
+ """
457
+ Prepares GT data by rle encoding segmentations and computing the average track area.
458
+ :return: None
459
+ """
460
+ if self.iou_type == "segm":
461
+ # only loaded when needed to reduce minimum requirements
462
+ from pycocotools import mask as mask_utils
463
+
464
+ for track in self.gt_data["annotations"]:
465
+ h = track["height"]
466
+ w = track["width"]
467
+ for i, seg in enumerate(track["segmentations"]):
468
+ if seg is not None and isinstance(seg["counts"], list):
469
+ track["segmentations"][i] = mask_utils.frPyObjects(seg, h, w)
470
+ areas = [a for a in track["areas"] if a]
471
+ if len(areas) == 0:
472
+ track["area"] = 0
473
+ else:
474
+ track["area"] = np.array(areas).mean()
475
+ else:
476
+ for track in self.gt_data["annotations"]:
477
+ # For bbox eval, compute areas from bboxes if not already available
478
+ areas = [a for a in track.get("areas", []) if a]
479
+ if not areas:
480
+ areas = []
481
+ for bbox in track.get("bboxes", []):
482
+ if bbox is not None:
483
+ areas.append(bbox[2] * bbox[3])
484
+ track["area"] = np.array(areas).mean() if areas else 0
485
+
486
+ def _get_tracker_seq_tracks(self, tracker, seq_id):
487
+ """
488
+ Prepares tracker data for a given sequence. Extracts all annotations for given sequence ID, computes
489
+ average track area and assigns a track ID.
490
+ :param tracker: the given tracker
491
+ :param seq_id: the sequence ID
492
+ :return: the extracted tracks
493
+ """
494
+ # only loaded when needed to reduce minimum requirements
495
+ from pycocotools import mask as mask_utils
496
+
497
+ tracks = [
498
+ ann for ann in self.tracker_data[tracker] if ann["video_id"] == seq_id
499
+ ]
500
+ for track in tracks:
501
+ if "areas" not in track:
502
+ if self.iou_type == "segm":
503
+ for seg in track["segmentations"]:
504
+ if seg:
505
+ track["areas"].append(mask_utils.area(seg))
506
+ else:
507
+ track["areas"].append(None)
508
+ else:
509
+ for bbox in track["bboxes"]:
510
+ if bbox:
511
+ track["areas"].append(bbox[2] * bbox[3])
512
+ else:
513
+ track["areas"].append(None)
514
+ areas = [a for a in track["areas"] if a]
515
+ if len(areas) == 0:
516
+ track["area"] = 0
517
+ else:
518
+ track["area"] = np.array(areas).mean()
519
+ track["id"] = self.global_tid_counter
520
+ self.global_tid_counter += 1
521
+ return tracks
522
+
523
+ def get_name(self):
524
+ return self.dataset_name
sam3/eval/hota_eval_toolkit/trackeval/eval.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import os
4
+ import time
5
+ import traceback
6
+ from functools import partial
7
+ from multiprocessing.pool import Pool
8
+
9
+ import numpy as np
10
+
11
+ from . import _timing, utils
12
+ from .metrics import Count
13
+ from .utils import TrackEvalException
14
+
15
+ try:
16
+ import tqdm
17
+
18
+ TQDM_IMPORTED = True
19
+ except ImportError as _:
20
+ TQDM_IMPORTED = False
21
+
22
+
23
+ class Evaluator:
24
+ """Evaluator class for evaluating different metrics for different datasets"""
25
+
26
+ @staticmethod
27
+ def get_default_eval_config():
28
+ """Returns the default config values for evaluation"""
29
+ code_path = utils.get_code_path()
30
+ default_config = {
31
+ "USE_PARALLEL": False,
32
+ "NUM_PARALLEL_CORES": 8,
33
+ "BREAK_ON_ERROR": True, # Raises exception and exits with error
34
+ "RETURN_ON_ERROR": False, # if not BREAK_ON_ERROR, then returns from function on error
35
+ "LOG_ON_ERROR": os.path.join(
36
+ code_path, "error_log.txt"
37
+ ), # if not None, save any errors into a log file.
38
+ "PRINT_RESULTS": True,
39
+ "PRINT_ONLY_COMBINED": False,
40
+ "PRINT_CONFIG": True,
41
+ "TIME_PROGRESS": True,
42
+ "DISPLAY_LESS_PROGRESS": True,
43
+ "OUTPUT_SUMMARY": True,
44
+ "OUTPUT_EMPTY_CLASSES": True, # If False, summary files are not output for classes with no detections
45
+ "OUTPUT_DETAILED": True,
46
+ "PLOT_CURVES": True,
47
+ }
48
+ return default_config
49
+
50
+ def __init__(self, config=None):
51
+ """Initialise the evaluator with a config file"""
52
+ self.config = utils.init_config(config, self.get_default_eval_config(), "Eval")
53
+ # Only run timing analysis if not run in parallel.
54
+ if self.config["TIME_PROGRESS"] and not self.config["USE_PARALLEL"]:
55
+ _timing.DO_TIMING = True
56
+ if self.config["DISPLAY_LESS_PROGRESS"]:
57
+ _timing.DISPLAY_LESS_PROGRESS = True
58
+
59
    def _combine_results(
        self,
        res,
        metrics_list,
        metric_names,
        dataset,
        res_field="COMBINED_SEQ",
        target_tag=None,
    ):
        """Combine per-sequence results into `res[res_field]`, then combine classes.

        When `target_tag` is given, only sequences whose GT annotations carry
        that tag contribute to the combination (used e.g. for a separate
        "challenging" aggregate). Returns (res, combined_cls_keys).
        """
        assert res_field.startswith("COMBINED_SEQ")
        # collecting combined cls keys (cls averaged, det averaged, super classes)
        tracker_list, seq_list, class_list = dataset.get_eval_info()
        combined_cls_keys = []
        res[res_field] = {}

        # narrow the target for evaluation
        if target_tag is not None:
            target_video_ids = [
                annot["video_id"]
                for annot in dataset.gt_data["annotations"]
                if target_tag in annot["tags"]
            ]
            # sequence name = top-level directory of the video's first frame path
            vid2name = {
                video["id"]: video["file_names"][0].split("/")[0]
                for video in dataset.gt_data["videos"]
            }
            target_video_ids = set(target_video_ids)
            target_video = [vid2name[video_id] for video_id in target_video_ids]

            if len(target_video) == 0:
                raise TrackEvalException(
                    "No sequences found with the tag %s" % target_tag
                )

            # sanity check: the tag must apply to every annotation of the
            # selected sequences (sequence-level tags only)
            target_annotations = [
                annot
                for annot in dataset.gt_data["annotations"]
                if annot["video_id"] in target_video_ids
            ]
            assert all(target_tag in annot["tags"] for annot in target_annotations), (
                f"Not all annotations in the target sequences have the target tag {target_tag}. "
                "We currently only support a target tag at the sequence level, not at the annotation level."
            )
        else:
            target_video = seq_list

        # combine sequences for each class
        for c_cls in class_list:
            res[res_field][c_cls] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                # gather per-sequence results, excluding any previously
                # computed COMBINED_SEQ* aggregates
                curr_res = {
                    seq_key: seq_value[c_cls][metric_name]
                    for seq_key, seq_value in res.items()
                    if not seq_key.startswith("COMBINED_SEQ")
                    and seq_key in target_video
                }
                res[res_field][c_cls][metric_name] = metric.combine_sequences(curr_res)
        # combine classes
        if dataset.should_classes_combine:
            combined_cls_keys += [
                "cls_comb_cls_av",
                "cls_comb_det_av",
                "all",
            ]
            res[res_field]["cls_comb_cls_av"] = {}
            res[res_field]["cls_comb_det_av"] = {}
            for metric, metric_name in zip(metrics_list, metric_names):
                cls_res = {
                    cls_key: cls_value[metric_name]
                    for cls_key, cls_value in res[res_field].items()
                    if cls_key not in combined_cls_keys
                }
                res[res_field]["cls_comb_cls_av"][metric_name] = (
                    metric.combine_classes_class_averaged(cls_res)
                )
                res[res_field]["cls_comb_det_av"][metric_name] = (
                    metric.combine_classes_det_averaged(cls_res)
                )
        # combine classes to super classes
        if dataset.use_super_categories:
            for cat, sub_cats in dataset.super_categories.items():
                combined_cls_keys.append(cat)
                res[res_field][cat] = {}
                for metric, metric_name in zip(metrics_list, metric_names):
                    cat_res = {
                        cls_key: cls_value[metric_name]
                        for cls_key, cls_value in res[res_field].items()
                        if cls_key in sub_cats
                    }
                    res[res_field][cat][metric_name] = (
                        metric.combine_classes_det_averaged(cat_res)
                    )
        return res, combined_cls_keys
152
+
153
    def _summarize_results(
        self,
        res,
        tracker,
        metrics_list,
        metric_names,
        dataset,
        res_field,
        combined_cls_keys,
    ):
        """Print, plot and write summary/detailed result files for one tracker.

        Iterates over every class key under `res[res_field]` (the per-class
        results plus any combined-class aggregates) and dispatches to each
        metric's print/summary/detail/plot helpers per the eval config.
        """
        config = self.config
        output_fol = dataset.get_output_fol(tracker)
        tracker_display_name = dataset.get_display_name(tracker)
        for c_cls in res[
            res_field
        ].keys():  # class_list + combined classes if calculated
            summaries = []
            details = []
            num_dets = res[res_field][c_cls]["Count"]["Dets"]
            # optionally skip classes with zero detections
            if config["OUTPUT_EMPTY_CLASSES"] or num_dets > 0:
                for metric, metric_name in zip(metrics_list, metric_names):
                    # for combined classes there is no per sequence evaluation
                    if c_cls in combined_cls_keys:
                        table_res = {res_field: res[res_field][c_cls][metric_name]}
                    else:
                        table_res = {
                            seq_key: seq_value[c_cls][metric_name]
                            for seq_key, seq_value in res.items()
                        }

                    if config["PRINT_RESULTS"] and config["PRINT_ONLY_COMBINED"]:
                        # only print the combined-class rows in this mode
                        dont_print = (
                            dataset.should_classes_combine
                            and c_cls not in combined_cls_keys
                        )
                        if not dont_print:
                            metric.print_table(
                                {res_field: table_res[res_field]},
                                tracker_display_name,
                                c_cls,
                                res_field,
                                res_field,
                            )
                    elif config["PRINT_RESULTS"]:
                        metric.print_table(
                            table_res, tracker_display_name, c_cls, res_field, res_field
                        )
                    if config["OUTPUT_SUMMARY"]:
                        summaries.append(metric.summary_results(table_res))
                    if config["OUTPUT_DETAILED"]:
                        details.append(metric.detailed_results(table_res))
                    if config["PLOT_CURVES"]:
                        metric.plot_single_tracker_results(
                            table_res,
                            tracker_display_name,
                            c_cls,
                            output_fol,
                        )
                if config["OUTPUT_SUMMARY"]:
                    utils.write_summary_results(summaries, c_cls, output_fol)
                if config["OUTPUT_DETAILED"]:
                    utils.write_detailed_results(details, c_cls, output_fol)
215
+
216
+ @_timing.time
217
+ def evaluate(self, dataset_list, metrics_list, show_progressbar=False):
218
+ """Evaluate a set of metrics on a set of datasets"""
219
+ config = self.config
220
+ metrics_list = metrics_list + [Count()] # Count metrics are always run
221
+ metric_names = utils.validate_metrics_list(metrics_list)
222
+ dataset_names = [dataset.get_name() for dataset in dataset_list]
223
+ output_res = {}
224
+ output_msg = {}
225
+
226
+ for dataset, dataset_name in zip(dataset_list, dataset_names):
227
+ # Get dataset info about what to evaluate
228
+ output_res[dataset_name] = {}
229
+ output_msg[dataset_name] = {}
230
+ tracker_list, seq_list, class_list = dataset.get_eval_info()
231
+ print(
232
+ "\nEvaluating %i tracker(s) on %i sequence(s) for %i class(es) on %s dataset using the following "
233
+ "metrics: %s\n"
234
+ % (
235
+ len(tracker_list),
236
+ len(seq_list),
237
+ len(class_list),
238
+ dataset_name,
239
+ ", ".join(metric_names),
240
+ )
241
+ )
242
+
243
+ # Evaluate each tracker
244
+ for tracker in tracker_list:
245
+ # if not config['BREAK_ON_ERROR'] then go to next tracker without breaking
246
+ try:
247
+ # Evaluate each sequence in parallel or in series.
248
+ # returns a nested dict (res), indexed like: res[seq][class][metric_name][sub_metric field]
249
+ # e.g. res[seq_0001][pedestrian][hota][DetA]
250
+ print("\nEvaluating %s\n" % tracker)
251
+ time_start = time.time()
252
+ if config["USE_PARALLEL"]:
253
+ if show_progressbar and TQDM_IMPORTED:
254
+ seq_list_sorted = sorted(seq_list)
255
+
256
+ with Pool(config["NUM_PARALLEL_CORES"]) as pool, tqdm.tqdm(
257
+ total=len(seq_list)
258
+ ) as pbar:
259
+ _eval_sequence = partial(
260
+ eval_sequence,
261
+ dataset=dataset,
262
+ tracker=tracker,
263
+ class_list=class_list,
264
+ metrics_list=metrics_list,
265
+ metric_names=metric_names,
266
+ )
267
+ results = []
268
+ for r in pool.imap(
269
+ _eval_sequence, seq_list_sorted, chunksize=20
270
+ ):
271
+ results.append(r)
272
+ pbar.update()
273
+ res = dict(zip(seq_list_sorted, results))
274
+
275
+ else:
276
+ with Pool(config["NUM_PARALLEL_CORES"]) as pool:
277
+ _eval_sequence = partial(
278
+ eval_sequence,
279
+ dataset=dataset,
280
+ tracker=tracker,
281
+ class_list=class_list,
282
+ metrics_list=metrics_list,
283
+ metric_names=metric_names,
284
+ )
285
+ results = pool.map(_eval_sequence, seq_list)
286
+ res = dict(zip(seq_list, results))
287
+ else:
288
+ res = {}
289
+ if show_progressbar and TQDM_IMPORTED:
290
+ seq_list_sorted = sorted(seq_list)
291
+ for curr_seq in tqdm.tqdm(seq_list_sorted):
292
+ res[curr_seq] = eval_sequence(
293
+ curr_seq,
294
+ dataset,
295
+ tracker,
296
+ class_list,
297
+ metrics_list,
298
+ metric_names,
299
+ )
300
+ else:
301
+ for curr_seq in sorted(seq_list):
302
+ res[curr_seq] = eval_sequence(
303
+ curr_seq,
304
+ dataset,
305
+ tracker,
306
+ class_list,
307
+ metrics_list,
308
+ metric_names,
309
+ )
310
+
311
+ # Combine results over all sequences and then over all classes
312
+ res, combined_cls_keys = self._combine_results(
313
+ res, metrics_list, metric_names, dataset, "COMBINED_SEQ"
314
+ )
315
+
316
+ if np.all(
317
+ ["tags" in annot for annot in dataset.gt_data["annotations"]]
318
+ ):
319
+ # Combine results over the challenging sequences and then over all classes
320
+ # currently only support "tracking_challenging_pair"
321
+ res, _ = self._combine_results(
322
+ res,
323
+ metrics_list,
324
+ metric_names,
325
+ dataset,
326
+ "COMBINED_SEQ_CHALLENGING",
327
+ "tracking_challenging_pair",
328
+ )
329
+
330
+ # Print and output results in various formats
331
+ if config["TIME_PROGRESS"]:
332
+ print(
333
+ "\nAll sequences for %s finished in %.2f seconds"
334
+ % (tracker, time.time() - time_start)
335
+ )
336
+
337
+ self._summarize_results(
338
+ res,
339
+ tracker,
340
+ metrics_list,
341
+ metric_names,
342
+ dataset,
343
+ "COMBINED_SEQ",
344
+ combined_cls_keys,
345
+ )
346
+ if "COMBINED_SEQ_CHALLENGING" in res:
347
+ self._summarize_results(
348
+ res,
349
+ tracker,
350
+ metrics_list,
351
+ metric_names,
352
+ dataset,
353
+ "COMBINED_SEQ_CHALLENGING",
354
+ combined_cls_keys,
355
+ )
356
+
357
+ # Output for returning from function
358
+ output_res[dataset_name][tracker] = res
359
+ output_msg[dataset_name][tracker] = "Success"
360
+
361
+ except Exception as err:
362
+ output_res[dataset_name][tracker] = None
363
+ if type(err) == TrackEvalException:
364
+ output_msg[dataset_name][tracker] = str(err)
365
+ else:
366
+ output_msg[dataset_name][tracker] = "Unknown error occurred."
367
+ print("Tracker %s was unable to be evaluated." % tracker)
368
+ print(err)
369
+ traceback.print_exc()
370
+ if config["LOG_ON_ERROR"] is not None:
371
+ with open(config["LOG_ON_ERROR"], "a") as f:
372
+ print(dataset_name, file=f)
373
+ print(tracker, file=f)
374
+ print(traceback.format_exc(), file=f)
375
+ print("\n\n\n", file=f)
376
+ if config["BREAK_ON_ERROR"]:
377
+ raise err
378
+ elif config["RETURN_ON_ERROR"]:
379
+ return output_res, output_msg
380
+
381
+ return output_res, output_msg
382
+
383
+
384
+ @_timing.time
385
+ def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
386
+ """Function for evaluating a single sequence"""
387
+
388
+ raw_data = dataset.get_raw_seq_data(tracker, seq)
389
+ seq_res = {}
390
+ for cls in class_list:
391
+ seq_res[cls] = {}
392
+ data = dataset.get_preprocessed_seq_data(raw_data, cls)
393
+ for metric, met_name in zip(metrics_list, metric_names):
394
+ seq_res[cls][met_name] = metric.eval_sequence(data)
395
+ return seq_res
sam3/eval/hota_eval_toolkit/trackeval/metrics/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ from .count import Count
4
+ from .hota import HOTA
sam3/eval/hota_eval_toolkit/trackeval/metrics/_base_metric.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ import numpy as np
6
+
7
+ from .. import _timing
8
+ from ..utils import TrackEvalException
9
+
10
+
11
+ class _BaseMetric(ABC):
12
+ @abstractmethod
13
+ def __init__(self):
14
+ self.plottable = False
15
+ self.integer_fields = []
16
+ self.float_fields = []
17
+ self.array_labels = []
18
+ self.integer_array_fields = []
19
+ self.float_array_fields = []
20
+ self.fields = []
21
+ self.summary_fields = []
22
+ self.registered = False
23
+
24
+ #####################################################################
25
+ # Abstract functions for subclasses to implement
26
+
27
+ @_timing.time
28
+ @abstractmethod
29
+ def eval_sequence(self, data): ...
30
+
31
+ @abstractmethod
32
+ def combine_sequences(self, all_res): ...
33
+
34
+ @abstractmethod
35
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False): ...
36
+
37
+ @abstractmethod
38
+ def combine_classes_det_averaged(self, all_res): ...
39
+
40
+ def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
41
+ """Plot results of metrics, only valid for metrics with self.plottable"""
42
+ if self.plottable:
43
+ raise NotImplementedError(
44
+ "plot_results is not implemented for metric %s" % self.get_name()
45
+ )
46
+ else:
47
+ pass
48
+
49
+ #####################################################################
50
+ # Helper functions which are useful for all metrics:
51
+
52
+ @classmethod
53
+ def get_name(cls):
54
+ return cls.__name__
55
+
56
+ @staticmethod
57
+ def _combine_sum(all_res, field):
58
+ """Combine sequence results via sum"""
59
+ return sum([all_res[k][field] for k in all_res.keys()])
60
+
61
+ @staticmethod
62
+ def _combine_weighted_av(all_res, field, comb_res, weight_field):
63
+ """Combine sequence results via weighted average"""
64
+ return sum(
65
+ [all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]
66
+ ) / np.maximum(1.0, comb_res[weight_field])
67
+
68
+ def print_table(
69
+ self, table_res, tracker, cls, res_field="COMBINED_SEQ", output_lable="COMBINED"
70
+ ):
71
+ """Prints table of results for all sequences"""
72
+ print("")
73
+ metric_name = self.get_name()
74
+ self._row_print(
75
+ [metric_name + ": " + tracker + "-" + cls] + self.summary_fields
76
+ )
77
+ for seq, results in sorted(table_res.items()):
78
+ if seq.startswith("COMBINED_SEQ"):
79
+ continue
80
+ summary_res = self._summary_row(results)
81
+ self._row_print([seq] + summary_res)
82
+ summary_res = self._summary_row(table_res[res_field])
83
+ self._row_print([output_lable] + summary_res)
84
+
85
+ def _summary_row(self, results_):
86
+ vals = []
87
+ for h in self.summary_fields:
88
+ if h in self.float_array_fields:
89
+ vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
90
+ elif h in self.float_fields:
91
+ vals.append("{0:1.5g}".format(100 * float(results_[h])))
92
+ elif h in self.integer_fields:
93
+ vals.append("{0:d}".format(int(results_[h])))
94
+ else:
95
+ raise NotImplementedError(
96
+ "Summary function not implemented for this field type."
97
+ )
98
+ return vals
99
+
100
+ @staticmethod
101
+ def _row_print(*argv):
102
+ """Prints results in an evenly spaced rows, with more space in first row"""
103
+ if len(argv) == 1:
104
+ argv = argv[0]
105
+ to_print = "%-35s" % argv[0]
106
+ for v in argv[1:]:
107
+ to_print += "%-10s" % str(v)
108
+ print(to_print)
109
+
110
+ def summary_results(self, table_res):
111
+ """Returns a simple summary of final results for a tracker"""
112
+ return dict(
113
+ zip(self.summary_fields, self._summary_row(table_res["COMBINED_SEQ"]))
114
+ )
115
+
116
+ def detailed_results(self, table_res):
117
+ """Returns detailed final results for a tracker"""
118
+ # Get detailed field information
119
+ detailed_fields = self.float_fields + self.integer_fields
120
+ for h in self.float_array_fields + self.integer_array_fields:
121
+ for alpha in [int(100 * x) for x in self.array_labels]:
122
+ detailed_fields.append(h + "___" + str(alpha))
123
+ detailed_fields.append(h + "___AUC")
124
+
125
+ # Get detailed results
126
+ detailed_results = {}
127
+ for seq, res in table_res.items():
128
+ detailed_row = self._detailed_row(res)
129
+ if len(detailed_row) != len(detailed_fields):
130
+ raise TrackEvalException(
131
+ "Field names and data have different sizes (%i and %i)"
132
+ % (len(detailed_row), len(detailed_fields))
133
+ )
134
+ detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
135
+ return detailed_results
136
+
137
+ def _detailed_row(self, res):
138
+ detailed_row = []
139
+ for h in self.float_fields + self.integer_fields:
140
+ detailed_row.append(res[h])
141
+ for h in self.float_array_fields + self.integer_array_fields:
142
+ for i, alpha in enumerate([int(100 * x) for x in self.array_labels]):
143
+ detailed_row.append(res[h][i])
144
+ detailed_row.append(np.mean(res[h]))
145
+ return detailed_row
sam3/eval/hota_eval_toolkit/trackeval/metrics/count.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ from .. import _timing
4
+ from ._base_metric import _BaseMetric
5
+
6
+
7
+ class Count(_BaseMetric):
8
+ """Class which simply counts the number of tracker and gt detections and ids."""
9
+
10
+ def __init__(self, config=None):
11
+ super().__init__()
12
+ self.integer_fields = ["Dets", "GT_Dets", "IDs", "GT_IDs"]
13
+ self.fields = self.integer_fields
14
+ self.summary_fields = self.fields
15
+
16
+ @_timing.time
17
+ def eval_sequence(self, data):
18
+ """Returns counts for one sequence"""
19
+ # Get results
20
+ res = {
21
+ "Dets": data["num_tracker_dets"],
22
+ "GT_Dets": data["num_gt_dets"],
23
+ "IDs": data["num_tracker_ids"],
24
+ "GT_IDs": data["num_gt_ids"],
25
+ "Frames": data["num_timesteps"],
26
+ }
27
+ return res
28
+
29
+ def combine_sequences(self, all_res):
30
+ """Combines metrics across all sequences"""
31
+ res = {}
32
+ for field in self.integer_fields:
33
+ res[field] = self._combine_sum(all_res, field)
34
+ return res
35
+
36
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=None):
37
+ """Combines metrics across all classes by averaging over the class values"""
38
+ res = {}
39
+ for field in self.integer_fields:
40
+ res[field] = self._combine_sum(all_res, field)
41
+ return res
42
+
43
+ def combine_classes_det_averaged(self, all_res):
44
+ """Combines metrics across all classes by averaging over the detection values"""
45
+ res = {}
46
+ for field in self.integer_fields:
47
+ res[field] = self._combine_sum(all_res, field)
48
+ return res
sam3/eval/hota_eval_toolkit/trackeval/metrics/hota.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import os
4
+
5
+ import numpy as np
6
+ from scipy.optimize import linear_sum_assignment
7
+
8
+ from .. import _timing
9
+ from ._base_metric import _BaseMetric
10
+
11
+
12
+ class HOTA(_BaseMetric):
13
+ """Class which implements the HOTA metrics.
14
+ See: https://link.springer.com/article/10.1007/s11263-020-01375-2
15
+ """
16
+
17
+ def __init__(self, config=None):
18
+ super().__init__()
19
+ self.plottable = True
20
+ self.array_labels = np.arange(0.05, 0.99, 0.05)
21
+ self.integer_array_fields = ["HOTA_TP", "HOTA_FN", "HOTA_FP"]
22
+ self.float_array_fields = [
23
+ "HOTA",
24
+ "DetA",
25
+ "AssA",
26
+ "DetRe",
27
+ "DetPr",
28
+ "AssRe",
29
+ "AssPr",
30
+ "LocA",
31
+ "OWTA",
32
+ ]
33
+ self.float_fields = ["HOTA(0)", "LocA(0)", "HOTALocA(0)"]
34
+ self.fields = (
35
+ self.float_array_fields + self.integer_array_fields + self.float_fields
36
+ )
37
+ self.summary_fields = self.float_array_fields + self.float_fields
38
+
39
+ @_timing.time
40
+ def eval_sequence(self, data):
41
+ """Calculates the HOTA metrics for one sequence"""
42
+
43
+ # Initialise results
44
+ res = {}
45
+ for field in self.float_array_fields + self.integer_array_fields:
46
+ res[field] = np.zeros((len(self.array_labels)), dtype=float)
47
+ for field in self.float_fields:
48
+ res[field] = 0
49
+
50
+ # Return result quickly if tracker or gt sequence is empty
51
+ if data["num_tracker_dets"] == 0:
52
+ res["HOTA_FN"] = data["num_gt_dets"] * np.ones(
53
+ (len(self.array_labels)), dtype=float
54
+ )
55
+ res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
56
+ res["LocA(0)"] = 1.0
57
+ return res
58
+ if data["num_gt_dets"] == 0:
59
+ res["HOTA_FP"] = data["num_tracker_dets"] * np.ones(
60
+ (len(self.array_labels)), dtype=float
61
+ )
62
+ res["LocA"] = np.ones((len(self.array_labels)), dtype=float)
63
+ res["LocA(0)"] = 1.0
64
+ return res
65
+
66
+ # Variables counting global association
67
+ potential_matches_count = np.zeros(
68
+ (data["num_gt_ids"], data["num_tracker_ids"])
69
+ )
70
+ gt_id_count = np.zeros((data["num_gt_ids"], 1))
71
+ tracker_id_count = np.zeros((1, data["num_tracker_ids"]))
72
+
73
+ # First loop through each timestep and accumulate global track information.
74
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(
75
+ zip(data["gt_ids"], data["tracker_ids"])
76
+ ):
77
+ # Count the potential matches between ids in each timestep
78
+ # These are normalised, weighted by the match similarity.
79
+ similarity = data["similarity_scores"][t]
80
+ sim_iou_denom = (
81
+ similarity.sum(0)[np.newaxis, :]
82
+ + similarity.sum(1)[:, np.newaxis]
83
+ - similarity
84
+ )
85
+ sim_iou = np.zeros_like(similarity)
86
+ sim_iou_mask = sim_iou_denom > 0 + np.finfo("float").eps
87
+ sim_iou[sim_iou_mask] = (
88
+ similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]
89
+ )
90
+ potential_matches_count[
91
+ gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
92
+ ] += sim_iou
93
+
94
+ # Calculate the total number of dets for each gt_id and tracker_id.
95
+ gt_id_count[gt_ids_t] += 1
96
+ tracker_id_count[0, tracker_ids_t] += 1
97
+
98
+ # Calculate overall jaccard alignment score (before unique matching) between IDs
99
+ global_alignment_score = potential_matches_count / (
100
+ gt_id_count + tracker_id_count - potential_matches_count
101
+ )
102
+ matches_counts = [
103
+ np.zeros_like(potential_matches_count) for _ in self.array_labels
104
+ ]
105
+
106
+ # Calculate scores for each timestep
107
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(
108
+ zip(data["gt_ids"], data["tracker_ids"])
109
+ ):
110
+ # Deal with the case that there are no gt_det/tracker_det in a timestep.
111
+ if len(gt_ids_t) == 0:
112
+ for a, alpha in enumerate(self.array_labels):
113
+ res["HOTA_FP"][a] += len(tracker_ids_t)
114
+ continue
115
+ if len(tracker_ids_t) == 0:
116
+ for a, alpha in enumerate(self.array_labels):
117
+ res["HOTA_FN"][a] += len(gt_ids_t)
118
+ continue
119
+
120
+ # Get matching scores between pairs of dets for optimizing HOTA
121
+ similarity = data["similarity_scores"][t]
122
+ score_mat = (
123
+ global_alignment_score[
124
+ gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]
125
+ ]
126
+ * similarity
127
+ )
128
+
129
+ # Hungarian algorithm to find best matches
130
+ match_rows, match_cols = linear_sum_assignment(-score_mat)
131
+
132
+ # Calculate and accumulate basic statistics
133
+ for a, alpha in enumerate(self.array_labels):
134
+ actually_matched_mask = (
135
+ similarity[match_rows, match_cols] >= alpha - np.finfo("float").eps
136
+ )
137
+ alpha_match_rows = match_rows[actually_matched_mask]
138
+ alpha_match_cols = match_cols[actually_matched_mask]
139
+ num_matches = len(alpha_match_rows)
140
+ res["HOTA_TP"][a] += num_matches
141
+ res["HOTA_FN"][a] += len(gt_ids_t) - num_matches
142
+ res["HOTA_FP"][a] += len(tracker_ids_t) - num_matches
143
+ if num_matches > 0:
144
+ res["LocA"][a] += sum(
145
+ similarity[alpha_match_rows, alpha_match_cols]
146
+ )
147
+ matches_counts[a][
148
+ gt_ids_t[alpha_match_rows], tracker_ids_t[alpha_match_cols]
149
+ ] += 1
150
+
151
+ # Calculate association scores (AssA, AssRe, AssPr) for the alpha value.
152
+ # First calculate scores per gt_id/tracker_id combo and then average over the number of detections.
153
+ for a, alpha in enumerate(self.array_labels):
154
+ matches_count = matches_counts[a]
155
+ ass_a = matches_count / np.maximum(
156
+ 1, gt_id_count + tracker_id_count - matches_count
157
+ )
158
+ res["AssA"][a] = np.sum(matches_count * ass_a) / np.maximum(
159
+ 1, res["HOTA_TP"][a]
160
+ )
161
+ ass_re = matches_count / np.maximum(1, gt_id_count)
162
+ res["AssRe"][a] = np.sum(matches_count * ass_re) / np.maximum(
163
+ 1, res["HOTA_TP"][a]
164
+ )
165
+ ass_pr = matches_count / np.maximum(1, tracker_id_count)
166
+ res["AssPr"][a] = np.sum(matches_count * ass_pr) / np.maximum(
167
+ 1, res["HOTA_TP"][a]
168
+ )
169
+
170
+ # Calculate final scores
171
+ res["LocA"] = np.maximum(1e-10, res["LocA"]) / np.maximum(1e-10, res["HOTA_TP"])
172
+ res = self._compute_final_fields(res)
173
+ return res
174
+
175
+ def combine_sequences(self, all_res):
176
+ """Combines metrics across all sequences"""
177
+ res = {}
178
+ for field in self.integer_array_fields:
179
+ res[field] = self._combine_sum(all_res, field)
180
+ for field in ["AssRe", "AssPr", "AssA"]:
181
+ res[field] = self._combine_weighted_av(
182
+ all_res, field, res, weight_field="HOTA_TP"
183
+ )
184
+ loca_weighted_sum = sum(
185
+ [all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
186
+ )
187
+ res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
188
+ 1e-10, res["HOTA_TP"]
189
+ )
190
+ res = self._compute_final_fields(res)
191
+ return res
192
+
193
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
194
+ """Combines metrics across all classes by averaging over the class values.
195
+ If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
196
+ """
197
+ res = {}
198
+ for field in self.integer_array_fields:
199
+ if ignore_empty_classes:
200
+ res[field] = self._combine_sum(
201
+ {
202
+ k: v
203
+ for k, v in all_res.items()
204
+ if (
205
+ v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
206
+ > 0 + np.finfo("float").eps
207
+ ).any()
208
+ },
209
+ field,
210
+ )
211
+ else:
212
+ res[field] = self._combine_sum(
213
+ {k: v for k, v in all_res.items()}, field
214
+ )
215
+
216
+ for field in self.float_fields + self.float_array_fields:
217
+ if ignore_empty_classes:
218
+ res[field] = np.mean(
219
+ [
220
+ v[field]
221
+ for v in all_res.values()
222
+ if (
223
+ v["HOTA_TP"] + v["HOTA_FN"] + v["HOTA_FP"]
224
+ > 0 + np.finfo("float").eps
225
+ ).any()
226
+ ],
227
+ axis=0,
228
+ )
229
+ else:
230
+ res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
231
+ return res
232
+
233
+ def combine_classes_det_averaged(self, all_res):
234
+ """Combines metrics across all classes by averaging over the detection values"""
235
+ res = {}
236
+ for field in self.integer_array_fields:
237
+ res[field] = self._combine_sum(all_res, field)
238
+ for field in ["AssRe", "AssPr", "AssA"]:
239
+ res[field] = self._combine_weighted_av(
240
+ all_res, field, res, weight_field="HOTA_TP"
241
+ )
242
+ loca_weighted_sum = sum(
243
+ [all_res[k]["LocA"] * all_res[k]["HOTA_TP"] for k in all_res.keys()]
244
+ )
245
+ res["LocA"] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(
246
+ 1e-10, res["HOTA_TP"]
247
+ )
248
+ res = self._compute_final_fields(res)
249
+ return res
250
+
251
+ @staticmethod
252
+ def _compute_final_fields(res):
253
+ """Calculate sub-metric ('field') values which only depend on other sub-metric values.
254
+ This function is used both for both per-sequence calculation, and in combining values across sequences.
255
+ """
256
+ res["DetRe"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FN"])
257
+ res["DetPr"] = res["HOTA_TP"] / np.maximum(1, res["HOTA_TP"] + res["HOTA_FP"])
258
+ res["DetA"] = res["HOTA_TP"] / np.maximum(
259
+ 1, res["HOTA_TP"] + res["HOTA_FN"] + res["HOTA_FP"]
260
+ )
261
+ res["HOTA"] = np.sqrt(res["DetA"] * res["AssA"])
262
+ res["OWTA"] = np.sqrt(res["DetRe"] * res["AssA"])
263
+
264
+ res["HOTA(0)"] = res["HOTA"][0]
265
+ res["LocA(0)"] = res["LocA"][0]
266
+ res["HOTALocA(0)"] = res["HOTA(0)"] * res["LocA(0)"]
267
+ return res
268
+
269
+ def plot_single_tracker_results(self, table_res, tracker, cls, output_folder):
270
+ """Create plot of results"""
271
+
272
+ # Only loaded when run to reduce minimum requirements
273
+ from matplotlib import pyplot as plt
274
+
275
+ res = table_res["COMBINED_SEQ"]
276
+ styles_to_plot = ["r", "b", "g", "b--", "b:", "g--", "g:", "m"]
277
+ for name, style in zip(self.float_array_fields, styles_to_plot):
278
+ plt.plot(self.array_labels, res[name], style)
279
+ plt.xlabel("alpha")
280
+ plt.ylabel("score")
281
+ plt.title(tracker + " - " + cls)
282
+ plt.axis([0, 1, 0, 1])
283
+ legend = []
284
+ for name in self.float_array_fields:
285
+ legend += [name + " (" + str(np.round(np.mean(res[name]), 2)) + ")"]
286
+ plt.legend(legend, loc="lower left")
287
+ out_file = os.path.join(output_folder, cls + "_plot.pdf")
288
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
289
+ plt.savefig(out_file)
290
+ plt.savefig(out_file.replace(".pdf", ".png"))
291
+ plt.clf()
sam3/eval/hota_eval_toolkit/trackeval/utils.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa
2
+
3
+ import argparse
4
+ import csv
5
+ import os
6
+ from collections import OrderedDict
7
+
8
+
9
+ def init_config(config, default_config, name=None):
10
+ """Initialise non-given config values with defaults"""
11
+ if config is None:
12
+ config = default_config
13
+ else:
14
+ for k in default_config.keys():
15
+ if k not in config.keys():
16
+ config[k] = default_config[k]
17
+ if name and config["PRINT_CONFIG"]:
18
+ print("\n%s Config:" % name)
19
+ for c in config.keys():
20
+ print("%-20s : %-30s" % (c, config[c]))
21
+ return config
22
+
23
+
24
+ def update_config(config):
25
+ """
26
+ Parse the arguments of a script and updates the config values for a given value if specified in the arguments.
27
+ :param config: the config to update
28
+ :return: the updated config
29
+ """
30
+ parser = argparse.ArgumentParser()
31
+ for setting in config.keys():
32
+ if type(config[setting]) == list or type(config[setting]) == type(None):
33
+ parser.add_argument("--" + setting, nargs="+")
34
+ else:
35
+ parser.add_argument("--" + setting)
36
+ args = parser.parse_args().__dict__
37
+ for setting in args.keys():
38
+ if args[setting] is not None:
39
+ if type(config[setting]) == type(True):
40
+ if args[setting] == "True":
41
+ x = True
42
+ elif args[setting] == "False":
43
+ x = False
44
+ else:
45
+ raise Exception(
46
+ "Command line parameter " + setting + "must be True or False"
47
+ )
48
+ elif type(config[setting]) == type(1):
49
+ x = int(args[setting])
50
+ elif type(args[setting]) == type(None):
51
+ x = None
52
+ else:
53
+ x = args[setting]
54
+ config[setting] = x
55
+ return config
56
+
57
+
58
+ def get_code_path():
59
+ """Get base path where code is"""
60
+ return os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
61
+
62
+
63
+ def validate_metrics_list(metrics_list):
64
+ """Get names of metric class and ensures they are unique, further checks that the fields within each metric class
65
+ do not have overlapping names.
66
+ """
67
+ metric_names = [metric.get_name() for metric in metrics_list]
68
+ # check metric names are unique
69
+ if len(metric_names) != len(set(metric_names)):
70
+ raise TrackEvalException(
71
+ "Code being run with multiple metrics of the same name"
72
+ )
73
+ fields = []
74
+ for m in metrics_list:
75
+ fields += m.fields
76
+ # check metric fields are unique
77
+ if len(fields) != len(set(fields)):
78
+ raise TrackEvalException(
79
+ "Code being run with multiple metrics with fields of the same name"
80
+ )
81
+ return metric_names
82
+
83
+
84
+ def write_summary_results(summaries, cls, output_folder):
85
+ """Write summary results to file"""
86
+
87
+ fields = sum([list(s.keys()) for s in summaries], [])
88
+ values = sum([list(s.values()) for s in summaries], [])
89
+
90
+ # In order to remain consistent upon new fields being adding, for each of the following fields if they are present
91
+ # they will be output in the summary first in the order below. Any further fields will be output in the order each
92
+ # metric family is called, and within each family either in the order they were added to the dict (python >= 3.6) or
93
+ # randomly (python < 3.6).
94
+ default_order = [
95
+ "HOTA",
96
+ "DetA",
97
+ "AssA",
98
+ "DetRe",
99
+ "DetPr",
100
+ "AssRe",
101
+ "AssPr",
102
+ "LocA",
103
+ "OWTA",
104
+ "HOTA(0)",
105
+ "LocA(0)",
106
+ "HOTALocA(0)",
107
+ "MOTA",
108
+ "MOTP",
109
+ "MODA",
110
+ "CLR_Re",
111
+ "CLR_Pr",
112
+ "MTR",
113
+ "PTR",
114
+ "MLR",
115
+ "CLR_TP",
116
+ "CLR_FN",
117
+ "CLR_FP",
118
+ "IDSW",
119
+ "MT",
120
+ "PT",
121
+ "ML",
122
+ "Frag",
123
+ "sMOTA",
124
+ "IDF1",
125
+ "IDR",
126
+ "IDP",
127
+ "IDTP",
128
+ "IDFN",
129
+ "IDFP",
130
+ "Dets",
131
+ "GT_Dets",
132
+ "IDs",
133
+ "GT_IDs",
134
+ ]
135
+ default_ordered_dict = OrderedDict(
136
+ zip(default_order, [None for _ in default_order])
137
+ )
138
+ for f, v in zip(fields, values):
139
+ default_ordered_dict[f] = v
140
+ for df in default_order:
141
+ if default_ordered_dict[df] is None:
142
+ del default_ordered_dict[df]
143
+ fields = list(default_ordered_dict.keys())
144
+ values = list(default_ordered_dict.values())
145
+
146
+ out_file = os.path.join(output_folder, cls + "_summary.txt")
147
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
148
+ with open(out_file, "w", newline="") as f:
149
+ writer = csv.writer(f, delimiter=" ")
150
+ writer.writerow(fields)
151
+ writer.writerow(values)
152
+
153
+
154
+ def write_detailed_results(details, cls, output_folder):
155
+ """Write detailed results to file"""
156
+ sequences = details[0].keys()
157
+ fields = ["seq"] + sum([list(s["COMBINED_SEQ"].keys()) for s in details], [])
158
+ out_file = os.path.join(output_folder, cls + "_detailed.csv")
159
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
160
+ with open(out_file, "w", newline="") as f:
161
+ writer = csv.writer(f)
162
+ writer.writerow(fields)
163
+ for seq in sorted(sequences):
164
+ if seq == "COMBINED_SEQ":
165
+ continue
166
+ writer.writerow([seq] + sum([list(s[seq].values()) for s in details], []))
167
+ writer.writerow(
168
+ ["COMBINED"] + sum([list(s["COMBINED_SEQ"].values()) for s in details], [])
169
+ )
170
+
171
+
172
+ def load_detail(file):
173
+ """Loads detailed data for a tracker."""
174
+ data = {}
175
+ with open(file) as f:
176
+ for i, row_text in enumerate(f):
177
+ row = row_text.replace("\r", "").replace("\n", "").split(",")
178
+ if i == 0:
179
+ keys = row[1:]
180
+ continue
181
+ current_values = row[1:]
182
+ seq = row[0]
183
+ if seq == "COMBINED":
184
+ seq = "COMBINED_SEQ"
185
+ if (len(current_values) == len(keys)) and seq != "":
186
+ data[seq] = {}
187
+ for key, value in zip(keys, current_values):
188
+ data[seq][key] = float(value)
189
+ return data
190
+
191
+
192
+ class TrackEvalException(Exception):
193
+ """Custom exception for catching expected errors."""
194
+
195
+ ...
sam3/eval/postprocessors.py ADDED
@@ -0,0 +1,648 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ """Postprocessors class to transform MDETR output according to the downstream task"""
4
+
5
+ import dataclasses
6
+ import logging
7
+ from collections import defaultdict
8
+ from typing import Dict, List, Optional
9
+
10
+ import numpy as np
11
+ import torch
12
+ from sam3.model import box_ops
13
+ from sam3.model.data_misc import BatchedInferenceMetadata, interpolate
14
+ from sam3.train.masks_ops import rle_encode, robust_rle_encode
15
+ from torch import nn
16
+
17
+
18
class PostProcessNullOp(nn.Module):
    """No-op postprocessor: forward does nothing and process_results passes
    the model's find stages through unchanged."""

    def __init__(self, **kwargs):
        # BUG FIX: the original called super(PostProcessNullOp).__init__(),
        # which builds an unbound super object and never runs
        # nn.Module.__init__, leaving the module uninitialised
        # (_parameters/_buffers/_modules missing).
        super().__init__()

    def forward(self, input):
        """Intentionally does nothing and returns None."""
        pass

    def process_results(self, **kwargs):
        """Return kwargs["find_stages"] untouched."""
        return kwargs["find_stages"]
+
29
+
30
class PostProcessImage(nn.Module):
    """This module converts the model's output into the format expected by the coco api"""

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        to_cpu: bool = True,
        use_original_ids: bool = False,
        use_original_sizes_box: bool = False,
        use_original_sizes_mask: bool = False,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        use_presence: bool = True,
        detection_threshold: float = -1.0,
    ) -> None:
        """
        Args:
            max_dets_per_img: cap on detections kept per image in
                ``process_results`` (<= 0 disables pruning).
            iou_type: "bbox" or "segm"; masks are only post-processed when "segm".
            to_cpu: move boxes/scores/masks to CPU before returning.
            use_original_ids: key results by original image ids and force the
                original category id as the predicted label.
            use_original_sizes_box: scale boxes to the original image size
                (otherwise sizes of 1 are used, i.e. normalized coordinates).
            use_original_sizes_mask: same as above but for mask interpolation.
            convert_mask_to_rle: encode interpolated masks as RLE dicts.
            always_interpolate_masks_on_gpu: move predicted masks to the CUDA
                device of the target sizes before interpolation.
            use_presence: multiply query probabilities by the sigmoid of the
                decoder presence logit.
            detection_threshold: if > 0, drop predictions scoring at or below
                this value.
        """
        super().__init__()
        self.max_dets_per_img = max_dets_per_img
        self.iou_type = iou_type
        self.to_cpu = to_cpu
        self.convert_mask_to_rle = convert_mask_to_rle
        self.always_interpolate_masks_on_gpu = always_interpolate_masks_on_gpu

        self.use_presence = use_presence
        self.detection_threshold = detection_threshold
        self.use_original_ids = use_original_ids
        self.use_original_sizes_box = use_original_sizes_box
        self.use_original_sizes_mask = use_original_sizes_mask

    @torch.no_grad()
    def forward(
        self,
        outputs,
        target_sizes_boxes,
        target_sizes_masks,
        forced_labels=None,
        consistent=False,
        ret_tensordict: bool = False,  # This is experimental
    ):
        """Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes_boxes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                For evaluation, this must be the original image size (before any data augmentation)
                For visualization, this should be the image size after data augment, but before padding
            target_sizes_masks: same but used to resize masks
            forced_labels: tensor of dimension [batch_size] containing the label to force for each image of the batch
                This is useful when evaluating the model using standard metrics (eg on COCO, LVIS). In that case,
                we query the model with every possible class label, so we when we pass the predictions to the evaluator,
                we want to make sure that the predicted "class" matches the one that was queried.
            consistent: whether all target sizes are equal
            ret_tensordict: Experimental argument. If true, return a tensordict.TensorDict instead of a list of dictionaries for easier manipulation.
        """
        if ret_tensordict:
            assert (
                consistent is True
            ), "We don't support returning TensorDict if the outputs have different shapes"  # NOTE: It's possible but we don't support it.
            assert self.detection_threshold <= 0.0, "TODO: implement?"
            # tensordict is an optional dependency; degrade gracefully if missing.
            try:
                from tensordict import TensorDict
            except ImportError:
                logging.info(
                    "tensordict is not installed. Install by running `pip install tensordict --no-deps`. Falling back by setting `ret_tensordict=False`"
                )
                ret_tensordict = False

        out_bbox = outputs["pred_boxes"] if "pred_boxes" in outputs else None
        out_logits = outputs["pred_logits"]
        pred_masks = outputs["pred_masks"] if self.iou_type == "segm" else None
        out_probs = out_logits.sigmoid()
        if self.use_presence:
            # Gate per-query probabilities by the image-level presence score.
            presence_score = outputs["presence_logit_dec"].sigmoid().unsqueeze(1)
            out_probs = out_probs * presence_score

        assert target_sizes_boxes.shape[1] == 2
        assert target_sizes_masks.shape[1] == 2
        batch_size = target_sizes_boxes.shape[0]

        boxes, scores, labels, keep = self._process_boxes_and_labels(
            target_sizes_boxes, forced_labels, out_bbox, out_probs
        )
        assert boxes is None or len(boxes) == batch_size
        out_masks = self._process_masks(
            target_sizes_masks, pred_masks, consistent=consistent, keep=keep
        )
        # Free the (potentially large) raw mask tensor as early as possible.
        del pred_masks

        if boxes is None:
            # Mask-only outputs: pad the box-related fields with None placeholders.
            assert out_masks is not None
            assert not ret_tensordict, "We don't support returning TensorDict if the output does not contain boxes"
            B = len(out_masks)
            boxes = [None] * B
            scores = [None] * B
            labels = [None] * B

        results = {
            "scores": scores,
            "labels": labels,
            "boxes": boxes,
        }
        if out_masks is not None:
            if self.convert_mask_to_rle:
                results.update(masks_rle=out_masks)
            else:
                results.update(masks=out_masks)

        if ret_tensordict:
            results = TensorDict(results).auto_batch_size_()
            if self.to_cpu:
                results = results.cpu()
        else:
            # Convert a dictionary of lists/tensors to a list of per-image dictionaries.
            results = [
                dict(zip(results.keys(), res_tuple))
                for res_tuple in zip(*results.values())
            ]

        return results

    def _process_masks(self, target_sizes, pred_masks, consistent=True, keep=None):
        """Interpolate predicted mask logits to ``target_sizes`` and binarize at 0.5.

        Returns a tensor (consistent=True) or a per-image list of tensors/RLEs,
        or None when there are no masks to process.
        """
        if pred_masks is None:
            return None
        if self.always_interpolate_masks_on_gpu:
            gpu_device = target_sizes.device
            assert gpu_device.type == "cuda"
            pred_masks = pred_masks.to(device=gpu_device)
        if consistent:
            assert keep is None, "TODO: implement?"
            # All masks should have the same shape, expected when processing a batch of size 1
            target_size = target_sizes.unique(dim=0)
            assert target_size.size(0) == 1, "Expecting all target sizes to be equal"
            out_masks = (
                interpolate(
                    pred_masks,
                    target_size.squeeze().tolist(),
                    mode="bilinear",
                    align_corners=False,
                ).sigmoid()
                > 0.5
            )
            if self.convert_mask_to_rle:
                raise RuntimeError("TODO: implement?")
            if self.to_cpu:
                out_masks = out_masks.cpu()
        else:
            # Per-image path: each image may have a different target size.
            out_masks = [[]] * len(pred_masks)

            assert keep is None or len(keep) == len(pred_masks)
            for i, mask in enumerate(pred_masks):
                h, w = target_sizes[i]
                if keep is not None:
                    mask = mask[keep[i]]
                # Try the GPU interpolation first; fall back to CPU on failure
                # (e.g. out-of-memory for very large masks).
                try:
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                except Exception as e:
                    logging.info("Issue found, reverting to CPU mode!")
                    mask_device = mask.device
                    mask = mask.cpu()
                    interpolated = (
                        interpolate(
                            mask.unsqueeze(1),
                            (h, w),
                            mode="bilinear",
                            align_corners=False,
                        ).sigmoid()
                        > 0.5
                    )
                    # Move the result back to the original device for consistency.
                    interpolated = interpolated.to(mask_device)

                if self.convert_mask_to_rle:
                    out_masks[i] = robust_rle_encode(interpolated.squeeze(1))
                else:
                    out_masks[i] = interpolated
                    if self.to_cpu:
                        out_masks[i] = out_masks[i].cpu()

        return out_masks

    def _process_boxes_and_labels(
        self, target_sizes, forced_labels, out_bbox, out_probs
    ):
        """Scale boxes to image size and derive (scores, labels, keep-mask).

        Returns (boxes, scores, labels, keep); all None when there are no boxes.
        ``keep`` is None unless ``detection_threshold`` > 0, in which case
        boxes/scores/labels become per-image lists of filtered tensors.
        """
        if out_bbox is None:
            return None, None, None, None
        assert len(out_probs) == len(target_sizes)
        if self.to_cpu:
            out_probs = out_probs.cpu()
        scores, labels = out_probs.max(-1)
        if forced_labels is None:
            labels = torch.ones_like(labels)
        else:
            # Force the queried category id so the evaluator sees a matching class.
            labels = forced_labels[:, None].expand_as(labels)

        # convert to [x0, y0, x1, y1] format
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)

        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        if self.to_cpu:
            boxes = boxes.cpu()

        keep = None
        if self.detection_threshold > 0:
            # Filter out the boxes with scores below the detection threshold
            keep = scores > self.detection_threshold
            assert len(keep) == len(boxes) == len(scores) == len(labels)

            boxes = [b[k.to(b.device)] for b, k in zip(boxes, keep)]
            scores = [s[k.to(s.device)] for s, k in zip(scores, keep)]
            labels = [l[k.to(l.device)] for l, k in zip(labels, keep)]

        return boxes, scores, labels, keep

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """Post-process all stages and aggregate per-image results.

        Results for the same image id across stages are concatenated, then
        optionally pruned to ``max_dets_per_img`` by top-k score.
        """
        if find_stages.loss_stages is not None:
            # Keep only the metadata entries corresponding to the loss stages.
            find_metadatas = [find_metadatas[i] for i in find_stages.loss_stages]
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            # Size of 1 means "leave coordinates normalized".
            img_size_for_boxes = (
                meta.original_size
                if self.use_original_sizes_box
                else torch.ones_like(meta.original_size)
            )
            img_size_for_masks = (
                meta.original_size
                if self.use_original_sizes_mask
                else torch.ones_like(meta.original_size)
            )
            detection_results = self(
                outputs,
                img_size_for_boxes,
                img_size_for_masks,
                forced_labels=(
                    meta.original_category_id if self.use_original_ids else None
                ),
            )
            ids = (
                meta.original_image_id if self.use_original_ids else meta.coco_image_id
            )
            assert len(detection_results) == len(ids)
            for img_id, result in zip(ids, detection_results):
                if img_id.item() not in results:
                    results[img_id.item()] = result
                else:
                    # Same image seen in multiple stages: merge field-by-field.
                    assert set(results[img_id.item()].keys()) == set(result.keys())
                    for k in result.keys():
                        if isinstance(result[k], torch.Tensor):
                            results[img_id.item()][k] = torch.cat(
                                [results[img_id.item()][k], result[k]], dim=0
                            )
                        elif isinstance(result[k], list):
                            results[img_id.item()][k] += result[k]
                        else:
                            raise NotImplementedError(
                                f"Unexpected type {type(result[k])} in result."
                            )
        # Prune the results to the max number of detections per image.
        for img_id, result in results.items():
            if (
                self.max_dets_per_img > 0
                and len(result["scores"]) > self.max_dets_per_img
            ):
                _, topk_indexes = torch.topk(
                    result["scores"], self.max_dets_per_img, dim=0
                )
                if self.to_cpu:
                    topk_indexes = topk_indexes.cpu()
                for k in result.keys():
                    if isinstance(results[img_id][k], list):
                        results[img_id][k] = [
                            results[img_id][k][i] for i in topk_indexes.tolist()
                        ]
                    else:
                        results[img_id][k] = results[img_id][k].to(topk_indexes.device)[
                            topk_indexes
                        ]

        return results
321
+
322
+
323
class PostProcessAPIVideo(PostProcessImage):
    """This module converts the video model's output into the format expected by the YT-VIS api"""

    def __init__(
        self,
        *args,
        to_cpu: bool = True,
        convert_mask_to_rle: bool = False,
        always_interpolate_masks_on_gpu: bool = True,
        prob_thresh: float = 0.5,
        use_presence: bool = False,
        **kwargs,
    ):
        """
        Args:
            to_cpu: move per-frame results to CPU after (optional) RLE encoding.
            convert_mask_to_rle: encode video masklets as RLE dicts (handled here,
                not in the base class).
            always_interpolate_masks_on_gpu: interpolate masks on the CUDA device.
            prob_thresh: per-frame probability threshold used to drop low-confidence
                detections from the tracklets (None disables).
            use_presence: forwarded to the base class presence gating.
        """
        super().__init__(
            *args,
            # Here we always set `convert_mask_to_rle=False` in the base `PostProcessAPI` class
            # (so that its `_process_masks` won't return a list of RLEs). If we want to return
            # RLEs for video masklets, we handle it in this `PostProcessAPIVideo` class instead.
            convert_mask_to_rle=False,
            # Here we always set `to_cpu=False` in the base `PostProcessAPI` class (so that
            # the interpolated masks won't be automatically moved back to CPU). We will handle
            # it in this `PostProcessAPIVideo` class instead.
            # NOTE(review): contrary to the comment above, `to_cpu` is NOT forwarded to the
            # base class here, so the base class keeps its own default (`to_cpu=True`) unless
            # a caller passes it via **kwargs — confirm whether `to_cpu=False` should be passed.
            always_interpolate_masks_on_gpu=always_interpolate_masks_on_gpu,
            use_presence=use_presence,
            **kwargs,
        )
        # Expected keys in the output dict to postprocess
        self.EXPECTED_KEYS = [
            "pred_logits",
            "pred_boxes",
            "pred_masks",
        ]
        # Whether to post-process video masklets (under packed representation) into RLE format
        self.convert_mask_to_rle_for_video = convert_mask_to_rle
        self.to_cpu_for_video = to_cpu
        self.prob_thresh = prob_thresh

    def process_results(
        self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
    ):
        """
        Tracking Postprocessor for SAM 3 video model.
        This function takes in the output of the SAM 3 video model and processes it to extract all the tracklet predictions.
        Args:
            find_stages: A list of tensors representing the output of the SAM 3 video model.
            find_metadatas: A list of BatchedInferenceMetadata objects containing metadata about each frame.
            **kwargs: Additional keyword arguments.
        Returns:
            A dictionary of predictions with video_id as key.
        """

        # Import tensordict here to avoid global dependency.
        try:
            from tensordict import TensorDict
        except ImportError as e:
            logging.error(
                "tensordict is not installed, please install by running `pip install tensordict --no-deps`"
            )
            raise e
        # Notes and assumptions:
        # 1- This postprocessor assumes results only for a single video.
        # 2- There are N stage outputs corresponding to N video frames
        # 3- Each stage outputs contains PxQ preds, where P is number of prompts and Q is number of object queries. The output should also contain the tracking object ids corresponding to each object query.
        # 4- The tracking object id has a default value of -1, indicating that the object query is not tracking any object in the frame, and hence its predictions can be ingored for a given frame.
        # 5- Some objects may be tracked in a subset of frames only. So, we first extract the predictions in a packed representation (for efficient postprocessing -- specially memory)
        #    and then we convert the packed representation into a padded one, where we zero pad boxes/masks for objects that are not tracked in some frames.
        # 6- We refer to objects by an object id, which is a tuple (prompt_idx, obj_id)

        assert len(find_stages) > 0, "There is nothing to postprocess?"
        PROMPT_AXIS, OBJ_QUERY_AXIS = (0, 1)
        NO_OBJ_ID = -1
        # Maps object ID -> [indices in packed tensor]
        tracked_objects_packed_idx = defaultdict(list)
        # Maps object ID -> [indices in padded tensor (abs frame index)]
        tracked_objects_frame_idx = defaultdict(list)
        total_num_preds = 0
        # This will hold the packed representation of predictions.
        vid_preds_packed: List[TensorDict] = []
        vid_masklets_rle_packed: List[Optional[Dict]] = []
        video_id = -1  # We assume single video postprocessing, this ID should be unique in the datapoint.

        for frame_idx, (frame_outs, meta) in enumerate(
            zip(find_stages, find_metadatas)
        ):
            # only store keys we need to extract the results
            frame_outs_td = TensorDict(
                {k: frame_outs[k] for k in self.EXPECTED_KEYS}
            ).auto_batch_size_()  # Shape is [P,Q,...]
            meta_td = TensorDict(
                dataclasses.asdict(meta)
            ).auto_batch_size_()  # Shape is [P,...]
            unique_vid_id = meta.original_image_id.unique()
            assert unique_vid_id.size(0) == 1
            if video_id == -1:
                video_id = unique_vid_id.item()
            else:
                assert (
                    video_id == unique_vid_id.item()
                ), "We can only postprocess one video per datapoint"
            # keeping track of which objects appear in the current frame
            obj_ids_per_frame = frame_outs["pred_object_ids"]
            assert obj_ids_per_frame.size(-1) == frame_outs["pred_logits"].size(-2)
            if self.prob_thresh is not None:
                # only keep the predictions on this frame with probability above the threshold
                # (remove those predictions during the keep-alive period of a tracking query,
                # where its "pred_object_ids" is still the tracked object ID rather than -1)
                pred_probs = frame_outs["pred_logits"].sigmoid().squeeze(-1)
                obj_ids_per_frame = torch.where(
                    pred_probs >= self.prob_thresh, obj_ids_per_frame, NO_OBJ_ID
                )
            tracked_obj_ids_idx = torch.where(obj_ids_per_frame != NO_OBJ_ID)
            # Object id is a tuple of (prompt_idx, obj_id). This is because the model can assign same obj_id for two different prompts.
            tracked_obj_ids = [
                (p_id.item(), obj_ids_per_frame[p_id, q_id].item())
                for p_id, q_id in zip(
                    tracked_obj_ids_idx[PROMPT_AXIS],
                    tracked_obj_ids_idx[OBJ_QUERY_AXIS],
                )
            ]
            if len(tracked_obj_ids) == 0:
                continue
            # For each object, we keep track of the packed and padded (frame index) indices
            for oid in tracked_obj_ids:
                tracked_objects_packed_idx[oid].append(total_num_preds)
                tracked_objects_frame_idx[oid].append(frame_idx)
                total_num_preds += 1

            # Since we have P*Q masks per frame, mask interpolation is the GPU memory bottleneck or time bottleneck in case of cpu processing.
            # Instead, we first extract results only for tracked objects, reducing the number of masks to K = sum_i(tracked_objs_per_ith_prompt), hopefully <<< P*Q
            tracked_objs_outs_td = frame_outs_td[
                tracked_obj_ids_idx
            ]  # [P,Q,...] --> [K,...]
            meta_td = meta_td[tracked_obj_ids_idx[PROMPT_AXIS].cpu()]
            if self.always_interpolate_masks_on_gpu:
                gpu_device = meta_td["original_size"].device
                assert gpu_device.type == "cuda"
                tracked_objs_outs_td = tracked_objs_outs_td.to(device=gpu_device)
            # NOTE(review): `PostProcessImage.forward` takes `target_sizes_boxes` AND
            # `target_sizes_masks` as separate positional arguments, but only one size
            # tensor is passed here; also `self.use_original_sizes` does not appear to be
            # defined by `PostProcessImage.__init__` (which sets `use_original_sizes_box`
            # and `use_original_sizes_mask`) — confirm against the intended base class.
            frame_results_td = self(
                tracked_objs_outs_td.unsqueeze(1),
                (
                    meta_td["original_size"]
                    if self.use_original_sizes
                    else torch.ones_like(meta_td["original_size"])
                ),
                forced_labels=(
                    meta_td["original_category_id"] if self.use_original_ids else None
                ),
                consistent=True,
                ret_tensordict=True,
            ).squeeze(1)
            del tracked_objs_outs_td

            # Optionally, remove "masks" from output tensor dict and directly encode them
            # to RLE format under packed representations
            if self.convert_mask_to_rle_for_video:
                interpolated_binary_masks = frame_results_td.pop("masks")
                rle_list = rle_encode(interpolated_binary_masks, return_areas=True)
                vid_masklets_rle_packed.extend(rle_list)
            # Optionally, move output TensorDict to CPU (do this after RLE encoding step above)
            if self.to_cpu_for_video:
                frame_results_td = frame_results_td.cpu()
            vid_preds_packed.append(frame_results_td)

        if len(vid_preds_packed) == 0:
            logging.debug(f"Video {video_id} has no predictions")
            return {video_id: []}

        vid_preds_packed = torch.cat(vid_preds_packed, dim=0)
        ############### Construct a padded representation of the predictions ###############
        num_preds = len(tracked_objects_packed_idx)
        num_frames = len(find_stages)
        # We zero pad any missing prediction
        # NOTE: here, we also have padded tensors for "scores" and "labels", but we overwrite them later.
        padded_frames_results = TensorDict(
            {
                k: torch.zeros(
                    num_preds, num_frames, *v.shape[1:], device=v.device, dtype=v.dtype
                )
                for k, v in vid_preds_packed.items()
            },
            batch_size=[
                num_preds,
                num_frames,
            ],
        )
        padded_frames_results["scores"][...] = -1e8  # a very low score for empty object
        # Track scores and labels of each pred tracklet, only for frames where the model was able to track that object
        tracklet_scores = []
        tracklet_labels = []
        # Optionally, fill the list of RLEs for masklets
        # note: only frames with actual predicted masks (in packed format) will be
        # filled with RLEs; the rest will remains None in results["masks_rle"]
        if self.convert_mask_to_rle_for_video:
            vid_masklets_rle_padded = [[None] * num_frames for _ in range(num_preds)]
        for o_idx, oid in enumerate(tracked_objects_packed_idx):
            oid2packed_idx = tracked_objects_packed_idx[oid]
            oid2padded_idx = tracked_objects_frame_idx[oid]
            obj_packed_results = vid_preds_packed[oid2packed_idx]
            padded_frames_results[o_idx][oid2padded_idx] = obj_packed_results
            if self.convert_mask_to_rle_for_video:
                for packed_idx, padded_idx in zip(oid2packed_idx, oid2padded_idx):
                    vid_masklets_rle_padded[o_idx][padded_idx] = (
                        vid_masklets_rle_packed[packed_idx]
                    )
            # NOTE: We need a single confidence score per tracklet for the mAP metric.
            # We use the average confidence score across time. (How does this impact AP?)
            tracklet_scores.append(obj_packed_results["scores"].mean())
            # We also need to have a unique category Id per tracklet.
            # This is not a problem for phrase AP, however, for mAP we do majority voting across time.
            tracklet_labels.append(obj_packed_results["labels"].mode()[0])

        results = padded_frames_results.to_dict()
        results["scores"] = torch.stack(tracklet_scores, dim=0)
        results["labels"] = torch.stack(tracklet_labels, dim=0)
        if self.convert_mask_to_rle_for_video:
            results["masks_rle"] = vid_masklets_rle_padded
        # we keep the frame-level scores since it's needed by some evaluation scripts
        results["per_frame_scores"] = padded_frames_results["scores"]

        return {video_id: results}
543
+
544
+
545
class PostProcessTracking(PostProcessImage):
    """Post-processor for tracking outputs, keyed by (media, object, frame).

    Results are returned as a dict mapping
    ``(media_id, object_id, frame_index)`` tuples to per-frame detections.
    """

    def __init__(
        self,
        max_dets_per_img: int,
        iou_type="bbox",
        force_single_mask: bool = False,
        **kwargs,
    ) -> None:
        """
        Args:
            max_dets_per_img: forwarded to the base class.
            iou_type: "bbox" or "segm"; forwarded to the base class.
            force_single_mask: keep only the highest-scoring mask per sample
                before post-processing.
        """
        super().__init__(max_dets_per_img=max_dets_per_img, iou_type=iou_type, **kwargs)
        self.force_single_mask = force_single_mask

    def process_results(
        self, find_stages, find_metadatas: BatchedInferenceMetadata, **kwargs
    ):
        """Post-process each stage and key results by (media, object, frame)."""
        assert len(find_stages) == len(find_metadatas)
        results = {}
        for outputs, meta in zip(find_stages, find_metadatas):
            if self.force_single_mask:
                # Reduce to the single best mask per sample (argmax over scores).
                scores, labels = outputs["pred_logits"].max(-1)
                m = []
                for i in range(len(outputs["pred_masks"])):
                    score, idx = scores[i].max(0)
                    m.append(outputs["pred_masks"][i][idx])
                outputs["pred_masks"] = torch.stack(m, 0).unsqueeze(1)
            # NOTE(review): `PostProcessImage.forward` declares separate positional
            # arguments `target_sizes_boxes` and `target_sizes_masks`, but only one
            # size tensor is passed here (so `consistent=False` binds to
            # `target_sizes_masks`?) — confirm the intended call signature.
            detection_results = self(outputs, meta.original_size, consistent=False)
            assert len(detection_results) == len(meta.coco_image_id)
            results.update(
                {
                    (media_id.item(), object_id.item(), frame_index.item()): result
                    for media_id, object_id, frame_index, result in zip(
                        meta.original_image_id,
                        meta.object_id,
                        meta.frame_index,
                        detection_results,
                    )
                }
            )
        return results
585
+
586
+
587
+ class PostProcessCounting(nn.Module):
588
+ """This module converts the model's output to be evaluated for counting tasks"""
589
+
590
+ def __init__(
591
+ self,
592
+ use_original_ids: bool = False,
593
+ threshold: float = 0.5,
594
+ use_presence: bool = False,
595
+ ) -> None:
596
+ """
597
+ Args:
598
+ use_original_ids: whether to use the original image ids or the coco ids
599
+ threshold: threshold for counting (values above this are counted)
600
+ """
601
+ super().__init__()
602
+ self.use_original_ids = use_original_ids
603
+ self.threshold = threshold
604
+ self.use_presence = use_presence
605
+
606
+ def forward(self, outputs, target_sizes):
607
+ """Perform the computation
608
+ Parameters:
609
+ outputs: raw outputs of the model
610
+ target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
611
+ """
612
+ # Extract scores from model outputs and apply sigmoid
613
+ scores = torch.sigmoid(outputs["pred_logits"]).squeeze(-1) # [B, N]
614
+ if self.use_presence:
615
+ presence_score = outputs["presence_logit_dec"].sigmoid()
616
+ if presence_score.ndim == 1:
617
+ presence_score = presence_score.unsqueeze(1) # [B, 1]
618
+ scores = scores * presence_score # [B, N]
619
+
620
+ # Calculate counts by summing values above threshold
621
+ counts = (scores > self.threshold).float().sum(dim=1)
622
+
623
+ assert len(counts) == len(target_sizes)
624
+ results = []
625
+ for count in counts:
626
+ results.append({"count": count.item()})
627
+
628
+ return results
629
+
630
+ @torch.no_grad()
631
+ def process_results(
632
+ self, find_stages, find_metadatas: List[BatchedInferenceMetadata], **kwargs
633
+ ):
634
+ assert len(find_stages) == len(find_metadatas)
635
+ results = {}
636
+ for outputs, meta in zip(find_stages, find_metadatas):
637
+ detection_results = self(
638
+ outputs,
639
+ meta.original_size,
640
+ )
641
+ ids = (
642
+ meta.original_image_id if self.use_original_ids else meta.coco_image_id
643
+ )
644
+ assert len(detection_results) == len(ids)
645
+ for img_id, result in zip(ids, detection_results):
646
+ results[img_id.item()] = result
647
+
648
+ return results
sam3/eval/saco_veval_eval.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+ import argparse
3
+ import json
4
+ import os
5
+ from collections import defaultdict
6
+
7
+ from iopath.common.file_io import g_pathmgr
8
+ from sam3.eval.saco_veval_evaluators import (
9
+ VideoCGF1Evaluator,
10
+ VideoPhraseApEvaluator,
11
+ VideoPhraseHotaEvaluator,
12
+ VideoTetaEvaluator,
13
+ YTVISPredFileEvaluator,
14
+ )
15
+
16
+
17
class VEvalEvaluator:
    """Runs the full suite of SA-Co video evaluators on one prediction file.

    Aggregates dataset-level metrics from mAP, phrase AP, TETA, HOTA and cgF1
    evaluators, plus per-(video, category) results, and writes everything to
    ``eval_res_file`` as JSON.
    """

    def __init__(self, gt_annot_file: str, eval_res_file: str):
        # gt_annot_file: path to the ground-truth annotation JSON.
        # eval_res_file: path where the combined metrics JSON will be written.
        self.gt_annot_file = gt_annot_file
        self.eval_res_file = eval_res_file
        self.evaluators = [
            # mAP
            YTVISPredFileEvaluator(gt_annot_file),
            # Phrase AP
            VideoPhraseApEvaluator(gt_annot_file),
            # TETA
            VideoTetaEvaluator(gt_annot_file, use_mask=True, is_exhaustive=True),
            # HOTA
            VideoPhraseHotaEvaluator(gt_annot_file),
            # cgF1
            VideoCGF1Evaluator(gt_annot_file),
        ]

    def run_eval(self, pred_file: str):
        """Evaluate ``pred_file`` with every evaluator and persist the metrics.

        Returns:
            A dict with "dataset_results" (flat metric name -> value) and
            "video_np_results" (a list of per-(video_id, category_id) dicts).
        """
        dataset_results = {}
        # (video_id, category_id) -> merged per-evaluator metric dicts.
        video_np_results = defaultdict(dict)
        for evaluator in self.evaluators:
            d_res, v_np_res = evaluator.evaluate(pred_file)
            dataset_results.update(d_res)
            for (video_id, category_id), res in v_np_res.items():
                video_np_results[(video_id, category_id)].update(res)

        # Keep the output schema non-empty even when no evaluator reports anything.
        if len(dataset_results) == 0:
            dataset_results = {"": 0.0}

        # Flatten the tuple keys into JSON-serializable records.
        formatted_video_np_results = [
            {"video_id": video_id, "category_id": category_id, **res}
            for (video_id, category_id), res in video_np_results.items()
        ]
        eval_metrics = {
            "dataset_results": dataset_results,
            "video_np_results": formatted_video_np_results,
        }

        with g_pathmgr.open(self.eval_res_file, "w") as f:
            json.dump(eval_metrics, f)

        return eval_metrics
59
+
60
+
61
def run_main_all(dataset_name, args):
    """Evaluate a single named dataset using the directory-layout arguments.

    File paths are derived from ``args.gt_annot_dir`` / ``args.pred_dir`` /
    ``args.eval_res_dir`` by appending dataset-specific suffixes.
    """
    gt_annot_file = os.path.join(args.gt_annot_dir, f"{dataset_name}.json")
    pred_file = os.path.join(args.pred_dir, f"{dataset_name}_preds.json")
    eval_res_file = os.path.join(args.eval_res_dir, f"{dataset_name}_eval_res.json")

    print(f"=== Running evaluation for Pred {pred_file} vs GT {gt_annot_file} ===")
    evaluator = VEvalEvaluator(gt_annot_file=gt_annot_file, eval_res_file=eval_res_file)
    evaluator.run_eval(pred_file=pred_file)
    print(f"=== Results saved to {eval_res_file} ===")
72
+
73
+
74
def main_all(args):
    """Run the evaluation for every SA-Co VEval dataset split, one at a time."""
    # multiprocessing may not really work as inner evaluator also using multiprocessing
    # so we just for loop
    dataset_names = (
        "saco_veval_sav_test",
        "saco_veval_sav_val",
        "saco_veval_yt1b_test",
        "saco_veval_yt1b_val",
        "saco_veval_smartglasses_test",
        "saco_veval_smartglasses_val",
    )
    for name in dataset_names:
        print(f"=== Running evaluation for dataset {name} ===")
        run_main_all(dataset_name=name, args=args)
89
+
90
+
91
def main_one(args):
    """Evaluate a single dataset given explicit GT / prediction / output paths."""
    print(
        f"=== Running evaluation for Pred {args.pred_file} vs GT {args.gt_annot_file} ==="
    )
    evaluator = VEvalEvaluator(
        gt_annot_file=args.gt_annot_file, eval_res_file=args.eval_res_file
    )
    evaluator.run_eval(pred_file=args.pred_file)
    print(f"=== Results saved to {args.eval_res_file} ===")
103
+
104
+
105
def main():
    """CLI entry point: dispatch to `main_all` or `main_one` via subcommands."""
    parser = argparse.ArgumentParser(description="Run video grounding evaluators")

    # Create subparsers for different commands
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Run evaluation for all datasets
    all_parser = subparsers.add_parser("all", help="Run evaluation for all datasets")
    all_parser.add_argument(
        "--gt_annot_dir",
        type=str,
        help="Directory that contains the ground truth annotation files",
    )
    all_parser.add_argument(
        "--pred_dir",
        type=str,
        help="Directory that contains the prediction files",
    )
    all_parser.add_argument(
        "--eval_res_dir",
        type=str,
        help="Directory that contains the eval results files",
    )
    all_parser.set_defaults(func=main_all)

    # Run evaluation for one dataset
    one_parser = subparsers.add_parser("one", help="Run evaluation for one dataset")
    one_parser.add_argument(
        "--gt_annot_file",
        type=str,
        help="Path to the ground truth annotation file",
    )
    one_parser.add_argument(
        "--pred_file",
        type=str,
        help="Path to the prediction file",
    )
    one_parser.add_argument(
        "--eval_res_file",
        type=str,
        help="Path to the eval results file",
    )
    one_parser.set_defaults(func=main_one)

    # Parse and dispatch to the handler selected via set_defaults(func=...)
    args = parser.parse_args()
    args.func(args)
152
+
153
+
154
+ if __name__ == "__main__":
155
+ main()
sam3/eval/saco_veval_evaluators.py ADDED
@@ -0,0 +1,838 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+ import json
3
+ import os
4
+ import tempfile
5
+ from collections import defaultdict
6
+ from typing import Dict, Optional, Sequence, Tuple
7
+
8
+ import numpy as np
9
+ import pycocotools.mask
10
+ from sam3.eval.cgf1_eval import CGF1_METRICS
11
+ from sam3.eval.conversion_util import (
12
+ convert_ytbvis_to_cocovid_gt,
13
+ convert_ytbvis_to_cocovid_pred,
14
+ )
15
+ from sam3.eval.hota_eval_toolkit.run_ytvis_eval import run_ytvis_eval
16
+ from sam3.eval.teta_eval_toolkit import config, Evaluator, metrics
17
+ from sam3.eval.teta_eval_toolkit.datasets import COCO, TAO
18
+ from sam3.eval.ytvis_coco_wrapper import YTVIS
19
+ from sam3.eval.ytvis_eval import VideoDemoF1Eval, YTVISeval
20
+ from sam3.train.nms_helper import process_frame_level_nms, process_track_level_nms
21
+
22
+
23
+ def _get_metric_index(metric_name: str, iou_threshold: Optional[float] = None) -> int:
24
+ """
25
+ Find the index of a metric in CGF1_METRICS by name and IoU threshold.
26
+
27
+ Args:
28
+ metric_name: Name of the metric (e.g., "cgF1", "precision", "recall")
29
+ iou_threshold: IoU threshold (None for average over 0.5:0.95, or specific value like 0.5, 0.75)
30
+
31
+ Returns:
32
+ Index of the metric in CGF1_METRICS
33
+
34
+ Raises:
35
+ ValueError: If metric not found
36
+ """
37
+ for idx, metric in enumerate(CGF1_METRICS):
38
+ if metric.name == metric_name and metric.iou_threshold == iou_threshold:
39
+ return idx
40
+ raise ValueError(
41
+ f"Metric '{metric_name}' with IoU threshold {iou_threshold} not found in CGF1_METRICS"
42
+ )
43
+
44
+
45
class BasePredFileEvaluator:
    """A base class for evaluating a prediction file.

    Concrete subclasses in this module implement ``evaluate(pred_file)``.
    """
49
+
50
+
51
class YTVISPredFileEvaluator(BasePredFileEvaluator):
    """Evaluate class mAP for YT-VIS prediction files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS format ground-truth annotation JSON.
            dataset_name: prefix used in the returned metric keys.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    # NOTE: return annotation fixed -- this method returns a 2-tuple, not a dict.
    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run YT-VIS mAP evaluation on `pred_file`.

        Returns:
            Tuple of (metrics dict keyed "<dataset>_<bbox|mask>_mAP_50_95",
            per-video-NP results dict -- always empty for this evaluator).
        """
        # use our internal video evaluation toolkit for YT-VIS pred file
        # (i.e. the same one we're using for video phrase AP)
        results = {}
        use_cats = True  # YT-VIS mAP evaluation uses categories
        ytvisGT = YTVIS(self.gt_ann_file, ignore_gt_cats=not use_cats)
        # the original YT-VIS GT annotations have uncompressed RLEs ("counts" is an integer list)
        # rather than compressed RLEs ("counts" is a string), so we first convert them here.
        if "segm" in self.iou_types:
            for ann in ytvisGT.dataset["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]

        with open(pred_file) as f:
            dt = json.load(f)
        # Our prediction file saves "video_id" and absolute (unnormalized) boxes.
        # Note that we should use the official (original) YT-VIS annotations (i.e. the one
        # saved via "scripts/datasets/training/ytvis_split.py", instead of the one saved
        # via "scripts/api_db_to_ytvis_json.py") in this evaluator, which contain absolute
        # boxes coordinates in its GT annotations.
        for d in dt:
            d["image_id"] = d["video_id"]
        ytvisDT = ytvisGT.loadRes(dt)

        for iou_type in self.iou_types:
            ytvisEval = YTVISeval(ytvisGT, ytvisDT, iou_type)

            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            ytvisEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            ytvisEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            ytvisEval.params.useCats = use_cats

            ytvisEval.evaluate()
            ytvisEval.accumulate()
            ytvisEval.summarize()
            result_key = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_mAP_50_95"
            results[result_key] = ytvisEval.stats[0]

        # video-NP level results not supported for `YTVISPredFileEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
114
+
115
+
116
class VideoPhraseApEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase AP with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS format ground-truth annotation JSON.
            dataset_name: prefix used in the returned metric keys.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    # NOTE: return annotation fixed -- this method returns a 2-tuple, not a dict.
    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run category-agnostic phrase-AP evaluation on `pred_file`.

        Returns:
            Tuple of (metrics dict, per-video-NP results dict -- always empty
            for this evaluator).
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        if "segm" in self.iou_types:
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        for d in dt:
            d["image_id"] = d["video_id"]

        results = {}
        use_cats = False  # Phrase AP evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)

        for iou_type in self.iou_types:
            phraseApEval = YTVISeval(ytvisGT, ytvisDT, iou_type)

            # set the area ranges for small, medium, and large objects (using
            # absolute pixel areas) as in the official YT-VIS evaluation toolkit:
            # https://github.com/achalddave/ytvosapi/blob/eca601117c9f86bad084cb91f1d918e9ab665a75/PythonAPI/ytvostools/ytvoseval.py#L538
            phraseApEval.params.areaRng = [
                [0**2, 1e5**2],
                [0**2, 128**2],
                [128**2, 256**2],
                [256**2, 1e5**2],
            ]
            phraseApEval.params.areaRngLbl = ["all", "small", "medium", "large"]
            phraseApEval.params.useCats = use_cats

            phraseApEval.evaluate()
            phraseApEval.accumulate()
            phraseApEval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_phrase_ap"
            # fetch Phrase AP results from the corresponding indices in `phraseApEval.stats`
            # (see `_summarizeDets` in https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py)
            results[result_prefix + "_50_95"] = phraseApEval.stats[0]  # IoU=0.5:0.95
            results[result_prefix + "_50"] = phraseApEval.stats[1]  # IoU=0.5
            results[result_prefix + "_75"] = phraseApEval.stats[2]  # IoU=0.75

        # video-NP level results not supported for `VideoPhraseApEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
182
+
183
+
184
class VideoCGF1Evaluator(BasePredFileEvaluator):
    """Evaluate Video Demo F1 (and CG-F1 / IL_MCC when possible) with YT-VIS
    format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS format ground-truth annotation JSON.
            dataset_name: prefix used in the returned metric keys.
            prob_thresh: probability threshold used by the Demo F1 evaluator.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

    # NOTE: return annotation fixed -- this method returns a 2-tuple, not a dict.
    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run Demo-F1 / CG-F1 evaluation on `pred_file`.

        Returns:
            Tuple of (aggregate metrics dict, per-(video_id, category_id)
            results dict filled by `extract_video_np_level_results`).
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # IL_MCC and CG-F1 can only be computed if we have "video_np_pairs" keys in the GT JSON
        compute_ilmcc_and_cgf1 = "video_np_pairs" in gt
        if not compute_ilmcc_and_cgf1:
            print(
                f"Warning: IL_MCC and CG-F1 are not computed for {pred_file=} as it does not have 'video_np_pairs' keys in the GT JSON"
            )
        # For phrase AP and demo F1 evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories under `useCat=False`
        gt, dt = remap_video_category_pairs_to_unique_video_ids(
            gt, dt, add_negative_np_pairs=compute_ilmcc_and_cgf1
        )
        if "segm" in self.iou_types:
            for ann in gt["annotations"]:
                ann["segmentations"] = [
                    _compress_rle(rle) for rle in ann["segmentations"]
                ]
        for d in dt:
            d["image_id"] = d["video_id"]

        results = {}
        use_cats = False  # Demo F1 evaluation does not use categories
        ytvisGT = YTVIS(annotation_file=None, ignore_gt_cats=not use_cats)
        ytvisGT.dataset = gt
        ytvisGT.createIndex()
        ytvisDT = ytvisGT.loadRes(dt)

        video_np_level_results = {}
        for iou_type in self.iou_types:
            demoF1Eval = VideoDemoF1Eval(ytvisGT, ytvisDT, iou_type, self.prob_thresh)

            demoF1Eval.params.useCats = use_cats
            demoF1Eval.params.areaRng = [[0**2, 1e5**2]]
            demoF1Eval.params.areaRngLbl = ["all"]
            demoF1Eval.params.maxDets = [100000]

            demoF1Eval.evaluate()
            demoF1Eval.accumulate()
            demoF1Eval.summarize()
            result_prefix = f"{self.dataset_name}"
            result_prefix += f"_{'mask' if iou_type == 'segm' else 'bbox'}_demo"

            stats = demoF1Eval.stats

            if compute_ilmcc_and_cgf1:
                # Average IoU threshold (0.5:0.95)
                cgf1_micro_avg_idx = _get_metric_index("cgF1", None)
                positive_micro_f1_avg_idx = _get_metric_index("positive_micro_F1", None)
                ilmcc_avg_idx = _get_metric_index("IL_MCC", None)
                results[result_prefix + "_cgf1_micro_50_95"] = stats[cgf1_micro_avg_idx]
                results[result_prefix + "_ilmcc_50_95"] = stats[ilmcc_avg_idx]
                results[result_prefix + "_positive_micro_f1_50_95"] = stats[
                    positive_micro_f1_avg_idx
                ]

                # IoU = 0.5; IL_MCC at a fixed IoU is derived as cgF1 / positive_micro_F1
                cgf1_micro_50_idx = _get_metric_index("cgF1", 0.5)
                positive_micro_f1_50_idx = _get_metric_index("positive_micro_F1", 0.5)
                results[result_prefix + "_cgf1_micro_50"] = stats[cgf1_micro_50_idx]
                results[result_prefix + "_ilmcc_50"] = float(
                    np.array(stats[cgf1_micro_50_idx])
                    / np.array(stats[positive_micro_f1_50_idx])
                )
                results[result_prefix + "_positive_micro_f1_50"] = stats[
                    positive_micro_f1_50_idx
                ]

                # IoU = 0.75
                cgf1_micro_75_idx = _get_metric_index("cgF1", 0.75)
                positive_micro_f1_75_idx = _get_metric_index("positive_micro_F1", 0.75)
                results[result_prefix + "_cgf1_micro_75"] = stats[cgf1_micro_75_idx]
                results[result_prefix + "_ilmcc_75"] = float(
                    np.array(stats[cgf1_micro_75_idx])
                    / np.array(stats[positive_micro_f1_75_idx])
                )
                results[result_prefix + "_positive_micro_f1_75"] = stats[
                    positive_micro_f1_75_idx
                ]

            self.extract_video_np_level_results(demoF1Eval, video_np_level_results)

        return results, video_np_level_results

    def extract_video_np_level_results(self, demoF1Eval, video_np_level_results):
        """Aggregate per-(video, NP) TP/FP/FN/F1 statistics into `video_np_level_results`.

        Keys are (orig_video_id, orig_category_id) pairs; values are dicts of
        per-IoU-threshold counts and F1 scores.
        """
        num_iou_thrs = len(demoF1Eval.params.iouThrs)
        iou_50_index = int(np.where(demoF1Eval.params.iouThrs == 0.5)[0])
        iou_75_index = int(np.where(demoF1Eval.params.iouThrs == 0.75)[0])

        result_prefix = "mask" if demoF1Eval.params.iouType == "segm" else "bbox"

        assert len(demoF1Eval.evalImgs) == len(demoF1Eval.cocoGt.dataset["images"])
        for i, video in enumerate(demoF1Eval.cocoGt.dataset["images"]):
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            eval_img_dict = demoF1Eval.evalImgs[i]

            TPs = eval_img_dict.get("TPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FPs = eval_img_dict.get("FPs", np.zeros(num_iou_thrs, dtype=np.int64))
            FNs = eval_img_dict.get("FNs", np.zeros(num_iou_thrs, dtype=np.int64))
            assert len(TPs) == len(FPs) == len(FNs) == num_iou_thrs
            # F1 = 2*TP / (2*TP + FP + FN), and we set F1 to 1.0 if denominator is 0
            denominator = 2 * TPs + FPs + FNs
            F1s = np.where(denominator > 0, 2 * TPs / np.maximum(denominator, 1), 1.0)
            local_results = {
                f"{result_prefix}_TP_50_95": float(TPs.mean()),
                f"{result_prefix}_FP_50_95": float(FPs.mean()),
                f"{result_prefix}_FN_50_95": float(FNs.mean()),
                f"{result_prefix}_F1_50_95": float(F1s.mean()),
                f"{result_prefix}_TP_50": float(TPs[iou_50_index]),
                f"{result_prefix}_FP_50": float(FPs[iou_50_index]),
                f"{result_prefix}_FN_50": float(FNs[iou_50_index]),
                f"{result_prefix}_F1_50": float(F1s[iou_50_index]),
                f"{result_prefix}_TP_75": float(TPs[iou_75_index]),
                f"{result_prefix}_FP_75": float(FPs[iou_75_index]),
                f"{result_prefix}_FN_75": float(FNs[iou_75_index]),
                f"{result_prefix}_F1_75": float(F1s[iou_75_index]),
            }
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
326
+
327
+
328
class VideoTetaEvaluator(BasePredFileEvaluator):
    """Evaluate TETA metric using YouTubeVIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        tracker_name: str = "Sam3",
        nms_threshold: float = 0.5,
        nms_strategy: str = "none",  # "track", "frame", or "none"
        prob_thresh: float = 0.5,
        is_exhaustive: bool = False,
        use_mask: bool = False,
        num_parallel_cores: int = 8,
    ):
        """Store configuration and validate the requested NMS strategy."""
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.tracker_name = tracker_name
        self.nms_threshold = nms_threshold
        self.nms_strategy = nms_strategy.lower()  # normalize case for comparisons
        self.prob_thresh = prob_thresh
        self.metric_prefix = "TETA"
        self.is_exhaustive = is_exhaustive
        self.use_mask = use_mask
        self.num_parallel_cores = num_parallel_cores

        # Reject anything outside the supported strategies up front.
        valid_strategies = ["track", "frame", "none"]
        print("current nms_strategy:", self.nms_strategy)
        if self.nms_strategy not in valid_strategies:
            raise ValueError(
                f"Invalid NMS strategy: {self.nms_strategy}. Must be one of {valid_strategies}"
            )

        # Echo the effective configuration.
        for label, value in [
            ("Initialized VideoTetaEvaluator with NMS strategy", self.nms_strategy),
            ("Probability threshold set to", self.prob_thresh),
            ("Dataset exhaustivity set to", self.is_exhaustive),
            ("Tracker name set to", self.tracker_name),
            ("Dataset name set to", self.dataset_name),
            ("Use mask set to", self.use_mask),
        ]:
            print(f"{label}: {value}")

    def process_predictions(self, pred_file: str, tmp_dir: str) -> str:
        """Filter predictions by score, optionally apply NMS, and write the
        processed predictions into `tmp_dir`; returns the output path."""
        with open(pred_file, "r") as f:
            predictions = json.load(f)
        print(f"Processing predictions with {self.nms_strategy} NMS strategy")

        # Drop low-scoring predictions first.
        if self.prob_thresh > 0:
            predictions = [p for p in predictions if p["score"] >= self.prob_thresh]
            print(
                f"Filtered to {len(predictions)} predictions with score >= {self.prob_thresh}"
            )
        # Bucket the predictions per video so NMS runs independently per video.
        per_video = defaultdict(list)
        for pred in predictions:
            per_video[pred["video_id"]].append(pred)
        # Apply the configured NMS strategy (validated in __init__).
        if self.nms_strategy == "track":
            process_track_level_nms(per_video, nms_threshold=self.nms_threshold)
        elif self.nms_strategy == "frame":
            process_frame_level_nms(per_video, nms_threshold=self.nms_threshold)
        else:
            print("Skipping NMS processing as strategy is set to 'none'")
        # Flatten back into one list and persist.
        flattened = [track for tracks in per_video.values() for track in tracks]
        out_path = os.path.join(tmp_dir, "processed_preds.json")
        with open(out_path, "w") as f:
            json.dump(flattened, f)

        print(f"Saved processed predictions to {out_path}")
        return out_path

    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Convert GT and predictions into COCO-vid format, run the TETA
        toolkit, and return the collected scores."""
        print(f"Evaluating TETA Metric with {self.nms_strategy.upper()} NMS strategy")
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Score filtering + NMS first
            processed_pred_file = self.process_predictions(pred_file, tmp_dir)

            # Ground truth in COCO-vid format
            gt_dir = os.path.join(tmp_dir, "gt")
            os.makedirs(gt_dir, exist_ok=True)
            gt_coco_path = os.path.join(gt_dir, "annotations.json")
            convert_ytbvis_to_cocovid_gt(self.gt_ann_file, gt_coco_path)

            # Predictions in COCO-vid format, under a per-tracker folder
            pred_dir = os.path.join(tmp_dir, "predictions")
            tracker_dir = os.path.join(pred_dir, self.tracker_name)
            os.makedirs(tracker_dir, exist_ok=True)
            pred_coco_path = os.path.join(tracker_dir, "track_results_cocofmt.json")
            convert_ytbvis_to_cocovid_pred(
                youtubevis_pred_path=processed_pred_file,
                converted_dataset_path=gt_coco_path,
                output_path=pred_coco_path,
            )
            # Configure the TETA toolkit
            eval_cfg = config.get_default_eval_config()
            eval_cfg["PRINT_ONLY_COMBINED"] = True
            eval_cfg["DISPLAY_LESS_PROGRESS"] = True
            eval_cfg["OUTPUT_TEMP_RAW_DATA"] = True
            eval_cfg["NUM_PARALLEL_CORES"] = self.num_parallel_cores
            dataset_cfg = config.get_default_dataset_config()
            dataset_cfg["TRACKERS_TO_EVAL"] = [self.tracker_name]
            dataset_cfg["GT_FOLDER"] = gt_dir
            dataset_cfg["OUTPUT_FOLDER"] = pred_dir
            dataset_cfg["TRACKER_SUB_FOLDER"] = tracker_dir
            dataset_cfg["USE_MASK"] = self.use_mask

            evaluator = Evaluator(eval_cfg)
            # Exhaustively-annotated data uses the COCO-style dataset wrapper;
            # federated (non-exhaustive) data uses the TAO-style one.
            if self.is_exhaustive:
                dataset_list = [COCO(dataset_cfg)]
                dataset_parsing_key = "COCO"
            else:
                dataset_list = [TAO(dataset_cfg)]
                dataset_parsing_key = "TAO"

            # Run evaluation
            eval_results, _ = evaluator.evaluate(
                dataset_list, [metrics.TETA(exhaustive=self.is_exhaustive)]
            )

            # The TETA row reports these ten scores in this fixed order.
            score_names = [
                "teta",
                "loc_a",
                "assoc_a",
                "cls_a",
                "loc_re",
                "loc_pr",
                "assoc_re",
                "assoc_pr",
                "cls_re",
                "cls_pr",
            ]
            teta_row = eval_results[dataset_parsing_key]["TETA"]
            key_prefix = f"{self.dataset_name}_{'mask' if self.use_mask else 'bbox'}"
            results = {
                f"{key_prefix}_{name}": float(teta_row[idx])
                for idx, name in enumerate(score_names)
            }

        # video-NP level results not supported for `VideoTetaEvaluator` yet
        video_np_level_results = {}
        return results, video_np_level_results
491
+
492
+
493
class VideoPhraseHotaEvaluator(BasePredFileEvaluator):
    """Evaluate Video Phrase HOTA with YT-VIS format prediction and GT files."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
        compute_video_mot_hota: bool = False,
    ):
        """
        Args:
            gt_ann_file: path to the YT-VIS format ground-truth annotation JSON.
            dataset_name: prefix used in the returned metric keys.
            prob_thresh: keep only predictions with score strictly above this value.
            iou_types: subset of ["bbox", "segm"]; defaults to both.
            compute_video_mot_hota: if True, compute video MOT HOTA, aggregating
                predictions/GT from all categories (class-agnostic).
        """
        self.gt_ann_file = gt_ann_file
        self.dataset_name = dataset_name
        self.prob_thresh = prob_thresh
        self.metric_prefix = "phrase"
        # the list of metrics to collect from the HOTA evaluation results
        self.metric_to_collect = [
            "HOTA",
            "DetA",
            "AssA",
            "DetRe",
            "DetPr",
            "AssRe",
            "AssPr",
            "LocA",
            "OWTA",
        ]
        self.iou_types = list(iou_types) if iou_types is not None else ["bbox", "segm"]
        assert all(iou_type in ["bbox", "segm"] for iou_type in self.iou_types)

        # If True, compute video MOT HOTA, aggregating predictions/GT from all categories.
        self.compute_video_mot_hota = compute_video_mot_hota

    # NOTE: return annotation fixed -- this method returns a 2-tuple, not a dict.
    def evaluate(self, pred_file: str) -> Tuple[Dict[str, float], Dict]:
        """Run HOTA evaluation on `pred_file` via the TrackEval YT-VIS toolkit.

        Returns:
            Tuple of (aggregate metrics dict, per-(video_id, category_id)
            results dict filled by `extract_video_np_level_results`).
        """
        with open(self.gt_ann_file) as f:
            gt = json.load(f)
        with open(pred_file) as f:
            dt = json.load(f)
        # keep only predictions with score above the probability threshold
        dt = [d for d in dt if d["score"] > self.prob_thresh]
        for d in dt:
            assert len(d["areas"]) == len(d["bboxes"])
            assert len(d["areas"]) == len(d["segmentations"])
            # remove empty boxes (otherwise they will count as false positives for during
            # per-frame detection accuracy in HOTA evaluation)
            for t in range(len(d["bboxes"])):
                bbox = d["bboxes"][t]
                if d["areas"][t] == 0 or bbox is None or all(x == 0 for x in bbox):
                    d["segmentations"][t] = None
                    d["bboxes"][t] = None
                    d["areas"][t] = None
            # check that box occurence and mask occurence are consistent
            for bbox, mask, area in zip(d["bboxes"], d["segmentations"], d["areas"]):
                assert (area is None) == (bbox is None)
                assert (area is None) == (mask is None)
            # set all scores to 1.0 for HOTA evaluation (just like Demo F1, the exact score
            # value is not used in HOTA metrics; it will be treated as a detection prediction
            # as long as its score is above the threshold)
            d["score"] = 1.0

        gt = _fill_in_ann_height_width(gt)
        if not self.compute_video_mot_hota:
            # remap the GT and DT annotations for phrase HOTA evaluation
            gt, dt = self._remap_gt_dt(gt, dt)
        else:
            # Compute video-level MOT HOTA: first apply track-level NMS
            video_groups = defaultdict(list)
            for pred in dt:
                video_groups[pred["video_id"]].append(pred)
            process_track_level_nms(video_groups, nms_threshold=0.5)
            dt = [track for tracks in video_groups.values() for track in tracks]

            # Remap GT track ids for class-agnostic HOTA
            gt, dt = remap_gt_dt_class_agnostic(gt, dt)

        # run the HOTA evaluation using TrackEval on the remapped (video_id, category_id) pairs
        out_dict = {}
        video_np_level_results = {}
        for iou_type in self.iou_types:
            output_res, _ = run_ytvis_eval(
                args=[
                    "--METRICS",
                    "HOTA",
                    "--IOU_TYPE",
                    iou_type,
                    "--DATASET_NAME",
                    self.dataset_name,
                    "--USE_PARALLEL",
                    "True",
                    "--NUM_PARALLEL_CORES",
                    "8",
                    "--PLOT_CURVES",
                    "False",
                    "--LOG_ON_ERROR",
                    "None",
                    "--PRINT_ONLY_COMBINED",
                    "True",
                    "--OUTPUT_SUMMARY",
                    "False",
                    "--OUTPUT_DETAILED",
                    "False",
                    "--TIME_PROGRESS",
                    "False",
                    "--PRINT_CONFIG",
                    "False",
                ],
                gt_json=gt,
                dt_json=dt,
            )
            self.extract_video_np_level_results(
                iou_type=iou_type,
                remapped_gt=gt,
                raw_results=output_res[self.dataset_name]["tracker"],
                video_np_level_results=video_np_level_results,
            )

            def _summarize_results(output_res, iou_type, field, suffix):
                """Collect combined-sequence HOTA metrics for `field` into `out_dict`."""
                eval_res = output_res[self.dataset_name]["tracker"][field]
                result_prefix = f"{self.dataset_name}_{'mask' if iou_type == 'segm' else 'bbox'}_{suffix}"
                # all categories were remapped to one, so take the combined-class average
                # (hoisted out of the metric loop -- it does not depend on metric_name)
                eval_res_hota = eval_res["cls_comb_cls_av"]["HOTA"]
                for metric_name in self.metric_to_collect:
                    result_key = f"{result_prefix}_{self.metric_prefix}_{metric_name}"
                    out_dict[result_key] = float(np.mean(eval_res_hota[metric_name]))

            _summarize_results(output_res, iou_type, "COMBINED_SEQ", "all")
            if "COMBINED_SEQ_CHALLENGING" in output_res[self.dataset_name]["tracker"]:
                _summarize_results(
                    output_res, iou_type, "COMBINED_SEQ_CHALLENGING", "challenging"
                )

        # (stale comment removed: per-video-NP results ARE collected above via
        # `extract_video_np_level_results`)
        return out_dict, video_np_level_results

    def _remap_gt_dt(self, gt, dt):
        """Remap GT/DT so each (video_id, category_id) pair becomes its own video
        under a single shared category (phrase HOTA is category-agnostic)."""
        # For phrase HOTA evaluation, we need to remap each pair of (video_id, category_id) to
        # a new unique video_id, so that we don't mix detections from different categories
        gt, dt = remap_video_category_pairs_to_unique_video_ids(gt, dt)
        # We further map all the categories to category_id=1 in HOTA evaluation toolkit
        # for phrase HOTA (similar to "useCat=False" for video phrase AP)
        remapped_category_id = 1
        gt["categories"] = [
            {
                "supercategory": "object",
                "id": remapped_category_id,
                "name": "_REMAPPED_FOR_PHRASE_METRICS_",
            }
        ]
        for ann in gt["annotations"]:
            ann["category_id"] = remapped_category_id
        for d in dt:
            d["category_id"] = remapped_category_id
        # To be compatible with the TrackEval YT-VIS evaluation toolkit, we need to give
        # unique filenames to each remapped video, so we add remapped video_id as prefix.
        for video in gt["videos"]:
            new_video_id = video["id"]
            video["file_names"] = [
                f"remapped_vid_{new_video_id:012d}/{name}"
                for name in video["file_names"]
            ]
        return gt, dt

    def extract_video_np_level_results(
        self, iou_type, remapped_gt, raw_results, video_np_level_results
    ):
        """Aggregate per-(video, NP) HOTA statistics into `video_np_level_results`,
        keyed by the original (video_id, category_id) pairs."""
        result_prefix = "mask" if iou_type == "segm" else "bbox"
        for video in remapped_gt["videos"]:
            # the original video id and category id before remapping
            video_id = video["orig_video_id"]
            category_id = video["orig_category_id"]
            video_key = f"remapped_vid_{video['id']:012d}"
            results = raw_results[video_key]["_REMAPPED_FOR_PHRASE_METRICS_"]["HOTA"]

            local_results = {}
            for metric_name in self.metric_to_collect:
                result_key = f"{result_prefix}_{metric_name}"
                local_results[result_key] = float(results[metric_name].mean())
            if (video_id, category_id) not in video_np_level_results:
                video_np_level_results[(video_id, category_id)] = {}
            video_np_level_results[(video_id, category_id)].update(local_results)
678
+
679
+
680
class VideoClassBasedHotaEvaluator(VideoPhraseHotaEvaluator):
    """Class-based (category-aware) HOTA: reuses the phrase-HOTA pipeline but
    keeps the original categories instead of remapping them."""

    def __init__(
        self,
        gt_ann_file: str,
        dataset_name: str = "video",
        prob_thresh: float = 0.5,
        iou_types: Optional[Sequence[str]] = None,
    ):
        """Same arguments as `VideoPhraseHotaEvaluator`; `iou_types` is now
        forwarded to the parent (previously it was silently fixed to the
        parent's default of ["bbox", "segm"])."""
        super().__init__(gt_ann_file, dataset_name, prob_thresh, iou_types)
        self.metric_prefix = "class"

    def _remap_gt_dt(self, gt, dt):
        # no remapping needed for class-based HOTA evaluation
        return gt, dt

    def extract_video_np_level_results(self, *args, **kwargs):
        # no video-NP level results for class-based HOTA evaluation
        pass
695
+
696
+
697
+ def _compress_rle(rle):
698
+ """Convert RLEs from uncompressed (integer list) to compressed (string) format."""
699
+ if rle is None:
700
+ return None
701
+ if isinstance(rle["counts"], list):
702
+ rle = pycocotools.mask.frPyObjects(rle, rle["size"][0], rle["size"][1])
703
+ rle["counts"] = rle["counts"].decode()
704
+ return rle
705
+
706
+
707
def remap_video_category_pairs_to_unique_video_ids(
    gt_json, dt_json, add_negative_np_pairs=False
):
    """
    Remap each pair of (video_id, category_id) to a new unique video_id. This is useful
    for phrase AP and demo F1 evaluation on videos, where we have `useCat=False` and
    rely on separating different NPs (from the same video) into different new video ids,
    so that we don't mix detections from different categories in computeIoU under `useCat=False`.

    This is consistent with how do we phrase AP and demo F1 evaluation on images, where we
    use a remapped unique coco_image_id for each image-NP pair (based in its query["id"] in
    CustomCocoDetectionAPI.load_queries in modulated_detection_api.py)
    """
    videos_by_id = {video["id"]: video for video in gt_json["videos"]}

    # Gather every (video_id, category_id) pair seen in predictions or GT.
    observed_pairs = {(pred["video_id"], pred["category_id"]) for pred in dt_json}
    observed_pairs.update(
        (ann["video_id"], ann["category_id"]) for ann in gt_json["annotations"]
    )

    # Deterministically assign each pair a fresh 1-based video id.
    pair_to_new_id = {
        pair: new_id for new_id, pair in enumerate(sorted(observed_pairs), start=1)
    }
    # Also map the negative NP pairs -- this is needed for IL_MCC and CG-F1 evaluation.
    if add_negative_np_pairs:
        for vnp in gt_json["video_np_pairs"]:
            pair = (vnp["video_id"], vnp["category_id"])
            if pair not in pair_to_new_id:
                pair_to_new_id[pair] = len(pair_to_new_id) + 1

    # Rewrite "video_id" in the predictions ...
    for pred in dt_json:
        pred["video_id"] = pair_to_new_id[(pred["video_id"], pred["category_id"])]
    # ... and in the GT annotations.
    for ann in gt_json["annotations"]:
        ann["video_id"] = pair_to_new_id[(ann["video_id"], ann["category_id"])]

    # Duplicate each video entry once per mapped pair, recording the original
    # ids so sample-level metrics can be traced back to the video-NP pairs.
    remapped_videos = []
    for (orig_video_id, orig_category_id), new_id in pair_to_new_id.items():
        entry = videos_by_id[orig_video_id].copy()
        entry["id"] = new_id
        entry["orig_video_id"] = orig_video_id
        entry["orig_category_id"] = orig_category_id
        remapped_videos.append(entry)
    gt_json["videos"] = remapped_videos

    return gt_json, dt_json
768
+
769
+
770
def remap_gt_dt_class_agnostic(gt, dt):
    """
    For class-agnostic HOTA, merge all GT tracks for each video (across NPs),
    ensure unique track_ids, and set all category_id to 1.
    Also, add orig_video_id and orig_category_id for compatibility.

    Bug fix: orig_category_id is now recorded from the annotations BEFORE the
    category ids are overwritten to 1 (previously it was read afterwards, so it
    was always 1 and the original category was lost).
    """
    # Record each video's original category_id (first annotation wins) BEFORE
    # any overwriting happens below.
    orig_cat_by_video = {}
    for ann in gt["annotations"]:
        orig_cat_by_video.setdefault(ann["video_id"], ann["category_id"])

    # 1. Remap all GT track_ids to be unique per video
    gt_anns_by_video = defaultdict(list)
    for ann in gt["annotations"]:
        gt_anns_by_video[ann["video_id"]].append(ann)

    # Ensure unique track ids across tracks of all videos
    next_tid = 1
    for _, anns in gt_anns_by_video.items():
        # Map old track_ids to new unique ones
        old_to_new_tid = {}
        for ann in anns:
            old_tid = ann["id"]
            if old_tid not in old_to_new_tid:
                old_to_new_tid[old_tid] = next_tid
                next_tid += 1
            ann["id"] = old_to_new_tid[old_tid]
            # Set category_id to 1 for class-agnostic
            ann["category_id"] = 1

    # Set all GT categories to a single category
    gt["categories"] = [
        {
            "supercategory": "object",
            "id": 1,
            "name": "_REMAPPED_FOR_PHRASE_METRICS_",
        }
    ]

    # Add orig_video_id and orig_category_id to each video for compatibility
    for video in gt["videos"]:
        video["orig_video_id"] = video["id"]
        # The first annotation's original category_id if available, else None
        video["orig_category_id"] = orig_cat_by_video.get(video["id"])
        # Unique per-video filename prefix required by the TrackEval toolkit
        video["file_names"] = [
            f"remapped_vid_{video['id']:012d}/{name}" for name in video["file_names"]
        ]

    # Set all DT category_id to 1
    for d in dt:
        d["category_id"] = 1
    return gt, dt
825
+
826
+
827
+ def _fill_in_ann_height_width(gt_json):
828
+ """Fill in missing height/width in GT annotations from its video info."""
829
+ video_id_to_video = {v["id"]: v for v in gt_json["videos"]}
830
+ for ann in gt_json["annotations"]:
831
+ if "height" not in ann or "width" not in ann:
832
+ video = video_id_to_video[ann["video_id"]]
833
+ if "height" not in ann:
834
+ ann["height"] = video["height"]
835
+ if "width" not in ann:
836
+ ann["width"] = video["width"]
837
+
838
+ return gt_json
sam3/eval/teta_eval_toolkit/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # fmt: off
2
+ # flake8: noqa
3
+
4
+ from . import config, datasets, metrics, utils
5
+ from .eval import Evaluator
sam3/eval/teta_eval_toolkit/_timing.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # fmt: off
2
+ # flake8: noqa
3
+
4
+ import inspect
5
+ from functools import wraps
6
+ from time import perf_counter
7
+
8
# Master switch: when False, @time adds no timing and just calls through.
DO_TIMING = False
# When True, suppress the per-call print line for instance methods.
DISPLAY_LESS_PROGRESS = False
# Accumulated wall-clock time per function/method display name.
timer_dict = {}
# Running count of timed top-level (non-method) function calls.
counter = 0


def time(f):
    """Decorator that times ``f`` with ``perf_counter`` when ``DO_TIMING`` is on.

    Per-call times are printed and accumulated into the module-level
    ``timer_dict``; a final summary is printed once a call to
    ``Evaluator.evaluate`` completes. When ``DO_TIMING`` is False the wrapped
    function runs with no overhead beyond a single flag check.

    Args:
        f: the function or method to wrap.

    Returns:
        The wrapped callable, with ``f``'s metadata preserved via ``wraps``.
    """

    @wraps(f)
    def wrap(*args, **kw):
        if not DO_TIMING:
            # Timing disabled (e.g. config["TIME_PROGRESS"] is false or
            # config["USE_PARALLEL"] is true): run normally, untimed.
            return f(*args, **kw)

        ts = perf_counter()
        result = f(*args, **kw)
        te = perf_counter()
        tt = te - ts

        # Derive a display name; methods are prefixed with their class name.
        # Guarding on bool(arg_names) fixes an IndexError the original raised
        # for zero-argument functions.
        arg_names = inspect.getfullargspec(f)[0]
        is_method = bool(arg_names) and arg_names[0] == "self"
        if is_method and DISPLAY_LESS_PROGRESS:
            return result
        if is_method:
            method_name = type(args[0]).__name__ + "." + f.__name__
        else:
            method_name = f.__name__

        # Accumulate total time per name for the final summary.
        if method_name in timer_dict:
            timer_dict[method_name] += tt
        else:
            timer_dict[method_name] = tt

        if method_name == "Evaluator.evaluate":
            # Evaluation finished: dump the accumulated timing summary.
            print("")
            print("Timing analysis:")
            for key, value in timer_dict.items():
                print("%-70s %2.4f sec" % (key, value))
        else:
            # Show selected arguments of interest next to the name.
            arg_titles = ["tracker", "seq", "cls"]
            arg_vals = []
            for i, a in enumerate(arg_names):
                if a not in arg_titles:
                    continue
                # Robustness fix: the original indexed args[i] unconditionally
                # (IndexError when the argument was passed by keyword) and
                # joined raw values (TypeError for non-str arguments).
                if i < len(args):
                    arg_vals.append(str(args[i]))
                elif a in kw:
                    arg_vals.append(str(kw[a]))
            arg_text = "(" + ", ".join(arg_vals) + ")"

            # Indent methods differently from free functions; functions whose
            # first parameter is named "test" are intentionally not printed.
            if is_method:
                print("%-74s %2.4f sec" % (" " * 4 + method_name + arg_text, tt))
            elif arg_names and arg_names[0] == "test":
                pass
            else:
                global counter
                counter += 1
                print("%i %-70s %2.4f sec" % (counter, method_name + arg_text, tt))

        return result

    return wrap