# NOTE: web-viewer residue (file-size banner, blame hashes, line-number gutter) removed here.
"""SpaceDebris Localizer - Gradio application.

Uses nvidia/LocateAnything-3B to locate space debris, satellite fragments,
and spacecraft components in space imagery.
"""

from __future__ import annotations

import json
import logging
import os

import gradio as gr
from PIL import Image

from src.config import APP_SUBTITLE, APP_TITLE
from src.inference import LocateAnythingWorker, run_localization
from src.prompts import get_example_prompts
from src.utils import ensure_rgb, format_json_output, format_metadata, validate_image

# Configure root logging at import time: INFO level, "<timestamp> <level> <message>" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# Module-wide cache for the model worker; stays None until get_worker() creates and loads it.
worker: LocateAnythingWorker | None = None


def get_worker() -> LocateAnythingWorker:
    """Return the shared model worker, creating and loading it on first use.

    The instance is stored in the module-level ``worker`` only after
    ``load()`` has returned. If construction or loading raises, the cache
    stays ``None`` so the next call retries the load instead of handing out
    an instance whose ``load()`` never completed.

    Returns:
        The cached ``LocateAnythingWorker`` whose ``load()`` has completed.

    Raises:
        Any exception raised by ``LocateAnythingWorker()`` or its ``load()``
        propagates unchanged to the caller.
    """
    global worker
    if worker is None:
        logger.info("Loading LocateAnything-3B model...")
        # Load into a local first: publishing before load() succeeds would
        # leave a half-initialised worker cached forever after a failure.
        candidate = LocateAnythingWorker()
        candidate.load()
        worker = candidate
        logger.info("Model loaded successfully.")
    return worker


def run_inference(
    image: Image.Image | None,
    prompt: str,
) -> tuple[Image.Image | None, str, str, str, str]:
    """Gradio callback: check the inputs, run localization, format the results.

    Input problems and exceptions raised during inference are reported through
    the status slot rather than raised; in those cases the image slot is
    ``None`` and the three text slots are empty strings.

    Args:
        image: The uploaded image (may be ``None``); acceptance is decided by
            ``validate_image``.
        prompt: Detection prompt; it is stripped of surrounding whitespace
            before being passed to the model.

    Returns:
        (annotated_image, metadata, raw_output, json_output, status_message)
    """

    def _failed(status: str) -> tuple[None, str, str, str, str]:
        # Uniform "nothing to show" result carrying only a status message.
        return None, "", "", "", status

    image_ok, reason = validate_image(image)
    if not image_ok:
        return _failed(f"Error: {reason}")

    query = prompt.strip() if prompt else ""
    if not query:
        return _failed("Error: Please enter a detection prompt.")

    try:
        rgb_image = ensure_rgb(image)
        annotated, raw_output, parsed = run_localization(
            rgb_image,
            query,
            worker=get_worker(),
        )

        metadata = format_metadata(parsed)
        json_str = json.dumps(format_json_output(parsed), indent=2, ensure_ascii=False)

        status_parts = [f"Done. Found {parsed.num_detections} object(s)."]
        if parsed.parse_errors:
            status_parts.append(f" ({len(parsed.parse_errors)} warning(s))")

        return annotated, metadata, raw_output, json_str, "".join(status_parts)

    except Exception as exc:
        # UI boundary: log the traceback and surface the message in the status box.
        logger.exception("Inference failed")
        return _failed(f"Inference error: {exc}")


def build_app() -> gr.Blocks:
    """Assemble the Gradio Blocks UI and connect it to ``run_inference``.

    Layout, top to bottom: title/subtitle header, a short usage note, a
    two-column row (image upload, prompt, run button and status on the left;
    annotated image plus Metadata / Raw Output / JSON Output tabs on the
    right), example prompts, a collapsed "About This Project" accordion, and
    a footer.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    page_css = """
        .main-title { text-align: center; margin-bottom: 0; }
        .subtitle { text-align: center; color: #666; margin-top: 0; }
        .footer { text-align: center; color: #999; font-size: 0.85em; margin-top: 20px; }
        """

    with gr.Blocks(title=APP_TITLE, theme=gr.themes.Soft(), css=page_css) as demo:
        # --- Header -------------------------------------------------------
        gr.HTML(f"""
            <h1 class="main-title">{APP_TITLE}</h1>
            <p class="subtitle">{APP_SUBTITLE}</p>
        """)

        gr.Markdown("""
        > **How it works:** Upload a space or satellite image and enter a natural-language
        > prompt describing what to locate. The model grounds your query in the image and
        > returns bounding box coordinates. Detection quality depends on image resolution,
        > object visibility, and model grounding capability.
        """)

        # --- Main row: inputs on the left, results on the right -------------
        with gr.Row():
            with gr.Column(scale=1):
                image_in = gr.Image(label="Upload Image", type="pil")
                prompt_box = gr.Textbox(
                    label="Detection Prompt",
                    placeholder="e.g. Locate all the instances that match the following description: space debris.",
                    lines=2,
                )
                run_button = gr.Button("Run Localization", variant="primary", size="lg")
                status_box = gr.Textbox(label="Status", lines=1, interactive=False)

            with gr.Column(scale=1):
                annotated_out = gr.Image(label="Annotated Image", type="pil")
                with gr.Tabs():
                    with gr.TabItem("Metadata"):
                        metadata_box = gr.Textbox(
                            label="Detection Metadata",
                            lines=6,
                            interactive=False,
                        )
                    with gr.TabItem("Raw Output"):
                        raw_box = gr.Textbox(
                            label="Raw Model Output",
                            lines=8,
                            interactive=False,
                            show_copy_button=True,
                        )
                    with gr.TabItem("JSON Output"):
                        json_box = gr.Code(label="Parsed JSON", language="json", lines=8)

        # --- Example prompts --------------------------------------------------
        gr.Markdown("### Example Prompts")
        gr.Markdown("Click an example to load it into the prompt field.")
        gr.Examples(
            examples=get_example_prompts(),
            inputs=[prompt_box],
            label="Space Debris Prompts",
        )

        # --- Project notes (collapsed by default) -----------------------------
        with gr.Accordion("About This Project", open=False):
            gr.Markdown("""
            **SpaceDebris Localizer** is a hackathon prototype demonstrating how NVIDIA's
            **LocateAnything-3B** vision-language model can be applied to orbital debris
            localization and satellite component identification.

            ### Capabilities
            - Open-set object detection from natural-language prompts
            - Bounding-box grounding for arbitrary visual concepts
            - Structured output with pixel-coordinate parsing

            ### Limitations
            - The model was trained on general grounding data, not specifically orbital imagery
            - Detection quality depends heavily on image resolution and object clarity
            - Small debris fragments may not be reliably detected
            - This is a proof-of-concept, not a production debris tracking system

            ### Model
            - [nvidia/LocateAnything-3B](https://huggingface.co/nvidia/LocateAnything-3B) on Hugging Face
            - 3B parameter vision-language model with Parallel Box Decoding
            - Coordinates are normalized to [0, 1000] and converted to pixel space
            """)

        gr.HTML('<p class="footer">Powered by nvidia/LocateAnything-3B | SpaceDebris Localizer</p>')

        # --- Event wiring -----------------------------------------------------
        # Clicking the button and submitting the prompt box run the same
        # handler with the same inputs and outputs; the output order matches
        # the tuple returned by run_inference.
        for register in (run_button.click, prompt_box.submit):
            register(
                fn=run_inference,
                inputs=[image_in, prompt_box],
                outputs=[annotated_out, metadata_box, raw_box, json_box, status_box],
            )

    return demo


if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces.
    # The port is read from the PORT environment variable, defaulting to 7860.
    app = build_app()
    listen_port = int(os.environ.get("PORT", "7860"))
    app.launch(server_name="0.0.0.0", server_port=listen_port)