Spaces:

Sathvik0101
/

obj_localizer

Running

File size: 6,541 Bytes

"""SpaceDebris Localizer - Gradio application.

Uses nvidia/LocateAnything-3B to locate space debris, satellite fragments,
and spacecraft components in space imagery.
"""

from __future__ import annotations

import json
import logging
import os

import gradio as gr
from PIL import Image

from src.config import APP_SUBTITLE, APP_TITLE
from src.inference import LocateAnythingWorker, run_localization
from src.prompts import get_example_prompts
from src.utils import ensure_rgb, format_json_output, format_metadata, validate_image

# Configure the root logger once at import time: INFO and above, with a
# timestamp and level name in front of every message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Process-wide model worker. Starts out as None and is filled in by
# get_worker() the first time inference is requested.
worker: LocateAnythingWorker | None = None


def get_worker() -> LocateAnythingWorker:
    """Return the shared model worker, loading it on first use.

    The new worker is built and loaded through a local variable and is only
    published to the module-level ``worker`` global after ``load()`` returns.
    If construction or loading raises, the global stays ``None``, so the next
    call retries the load instead of handing back an instance whose ``load()``
    never completed.

    Returns:
        The cached ``LocateAnythingWorker`` instance.

    Raises:
        Any exception raised by ``LocateAnythingWorker()`` or its ``load()``
        is propagated unchanged to the caller.
    """
    global worker
    if worker is None:
        logger.info("Loading LocateAnything-3B model...")
        candidate = LocateAnythingWorker()
        candidate.load()
        # Publish only after a successful load so a failure is retryable.
        worker = candidate
        logger.info("Model loaded successfully.")
    return worker


def run_inference(
    image: Image.Image | None,
    prompt: str,
) -> tuple[Image.Image | None, str, str, str, str]:
    """Gradio callback: locate the prompted objects in an uploaded image.

    The image is checked with ``validate_image`` and the prompt must contain
    non-whitespace text; if either check fails an error status is returned
    and the model is not touched. Any exception raised while running the
    model or formatting its results is logged and reported through the status
    field instead of propagating to Gradio.

    Args:
        image: The uploaded image, or ``None`` when nothing was provided.
        prompt: Natural-language description of what to locate.

    Returns:
        (annotated_image, metadata, raw_output, json_output, status_message).
        On any failure the image is ``None`` and the three text outputs are
        empty strings; only the status message carries information.
    """

    def _failed(message: str) -> tuple[None, str, str, str, str]:
        # Every failure path clears all outputs and reports via the status.
        return None, "", "", "", message

    ok, reason = validate_image(image)
    if not ok:
        return _failed(f"Error: {reason}")

    query = prompt.strip() if prompt else ""
    if not query:
        return _failed("Error: Please enter a detection prompt.")

    try:
        rgb_image = ensure_rgb(image)
        model = get_worker()
        annotated, raw_text, detections = run_localization(rgb_image, query, worker=model)

        summary = format_metadata(detections)
        payload = json.dumps(format_json_output(detections), indent=2, ensure_ascii=False)

        status_parts = [f"Done. Found {detections.num_detections} object(s)."]
        if detections.parse_errors:
            status_parts.append(f" ({len(detections.parse_errors)} warning(s))")
    except Exception as exc:
        # Top-level UI boundary: log the traceback, show a short message.
        logger.exception("Inference failed")
        return _failed(f"Inference error: {exc}")

    return annotated, summary, raw_text, payload, "".join(status_parts)


def build_app() -> gr.Blocks:
    """Assemble the Gradio Blocks UI and wire it to ``run_inference``.

    Layout, top to bottom: title/subtitle header, a short "how it works"
    note, a two-column row (inputs and status on the left, annotated image
    and tabbed text outputs on the right), clickable example prompts, a
    collapsed "About" accordion, and a footer.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    # Styles for the CSS classes used by the header and footer HTML.
    page_css = """
        .main-title { text-align: center; margin-bottom: 0; }
        .subtitle { text-align: center; color: #666; margin-top: 0; }
        .footer { text-align: center; color: #999; font-size: 0.85em; margin-top: 20px; }
        """

    with gr.Blocks(title=APP_TITLE, theme=gr.themes.Soft(), css=page_css) as app:
        # --- Header -------------------------------------------------------
        gr.HTML(f"""
            <h1 class="main-title">{APP_TITLE}</h1>
            <p class="subtitle">{APP_SUBTITLE}</p>
        """)

        gr.Markdown("""
        > **How it works:** Upload a space or satellite image and enter a natural-language
        > prompt describing what to locate. The model grounds your query in the image and
        > returns bounding box coordinates. Detection quality depends on image resolution,
        > object visibility, and model grounding capability.
        """)

        # --- Main row: inputs on the left, results on the right -----------
        with gr.Row():
            with gr.Column(scale=1):
                image_in = gr.Image(type="pil", label="Upload Image")
                prompt_box = gr.Textbox(
                    label="Detection Prompt",
                    placeholder="e.g. Locate all the instances that match the following description: space debris.",
                    lines=2,
                )
                run_button = gr.Button("Run Localization", variant="primary", size="lg")
                status_box = gr.Textbox(label="Status", interactive=False, lines=1)

            with gr.Column(scale=1):
                annotated_out = gr.Image(type="pil", label="Annotated Image")
                with gr.Tabs():
                    with gr.TabItem("Metadata"):
                        metadata_box = gr.Textbox(
                            label="Detection Metadata",
                            lines=6,
                            interactive=False,
                        )
                    with gr.TabItem("Raw Output"):
                        raw_box = gr.Textbox(
                            label="Raw Model Output",
                            lines=8,
                            interactive=False,
                            show_copy_button=True,
                        )
                    with gr.TabItem("JSON Output"):
                        json_box = gr.Code(label="Parsed JSON", language="json", lines=8)

        # --- Example prompts ----------------------------------------------
        gr.Markdown("### Example Prompts")
        gr.Markdown("Click an example to load it into the prompt field.")
        gr.Examples(
            examples=get_example_prompts(),
            inputs=[prompt_box],
            label="Space Debris Prompts",
        )

        # --- Project background -------------------------------------------
        with gr.Accordion("About This Project", open=False):
            gr.Markdown("""
            **SpaceDebris Localizer** is a hackathon prototype demonstrating how NVIDIA's
            **LocateAnything-3B** vision-language model can be applied to orbital debris
            localization and satellite component identification.

            ### Capabilities
            - Open-set object detection from natural-language prompts
            - Bounding-box grounding for arbitrary visual concepts
            - Structured output with pixel-coordinate parsing

            ### Limitations
            - The model was trained on general grounding data, not specifically orbital imagery
            - Detection quality depends heavily on image resolution and object clarity
            - Small debris fragments may not be reliably detected
            - This is a proof-of-concept, not a production debris tracking system

            ### Model
            - [nvidia/LocateAnything-3B](https://huggingface.co/nvidia/LocateAnything-3B) on Hugging Face
            - 3B parameter vision-language model with Parallel Box Decoding
            - Coordinates are normalized to [0, 1000] and converted to pixel space
            """)

        gr.HTML('<p class="footer">Powered by nvidia/LocateAnything-3B | SpaceDebris Localizer</p>')

        # --- Event wiring ---------------------------------------------------
        # Clicking the button and pressing Enter in the prompt box both run
        # the same callback with the same inputs and outputs.
        for register in (run_button.click, prompt_box.submit):
            register(
                fn=run_inference,
                inputs=[image_in, prompt_box],
                outputs=[annotated_out, metadata_box, raw_box, json_box, status_box],
            )

    return app


if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every interface.
    # The port is read from the PORT environment variable (default 7860).
    app = build_app()
    listen_port = int(os.environ.get("PORT", "7860"))
    app.launch(server_name="0.0.0.0", server_port=listen_port)